Updates to blktap driver and user code.
author  akw27@arcadians.cl.cam.ac.uk <akw27@arcadians.cl.cam.ac.uk>
Sun, 4 Sep 2005 21:19:44 +0000 (21:19 +0000)
committer  akw27@arcadians.cl.cam.ac.uk <akw27@arcadians.cl.cam.ac.uk>
Sun, 4 Sep 2005 21:19:44 +0000 (21:19 +0000)
Mostly this makes the tap code work again with all of the changes that
have happened to the block drivers recently.  We now use a shared page
per VBD (to the driver), and handle control information through the
store.  The taplib interfaces have changed to be based around per-VBD
data structures to which you can attach arbitrary handlers.
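
As a rough illustration of the per-VBD handler model, here is a minimal
user-space sketch.  The names blkif_t, blkif_register_request_hook(),
blktap_listen() and BLKTAP_PASS are illustrative assumptions about the
taplib interface, not a verbatim copy of it.

/* Hypothetical taplib client: trace requests on every VBD, then pass
 * them through.  All non-standard identifiers here are assumptions. */
#include <stdio.h>
#include "blktaplib.h"

static int trace_request(blkif_t *blkif, blkif_request_t *req, int batch_done)
{
    /* Log the request, then let it continue down the normal path. */
    printf("vbd (%u,%u): op %d id %lu\n",
           (unsigned)blkif->domid, (unsigned)blkif->handle,
           req->operation, (unsigned long)req->id);
    return BLKTAP_PASS;
}

int main(void)
{
    blkif_register_request_hook("trace", trace_request);
    blktap_listen();   /* service the tap device until interrupted */
    return 0;
}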

There is also initial code for a user-level blockback driver, which
aims to avoid the use of loopback devices for file-based VBDs.  There is
still plenty of work to do here -- this is a working incremental checkin
and I'm away from this for the next four weeks.
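
To make the loopback point concrete, here is a minimal sketch of how a
user-level backend can serve a file-backed VBD by issuing pread()/pwrite()
directly against the image, instead of binding the file to a loop device
for the kernel backend.  The ring plumbing is elided, and the helper names
and 512-byte sector size are assumptions rather than actual ublkback code.

/* Sketch only: direct file-backed I/O for a user-level block backend. */
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>

#define SECTOR_SIZE 512   /* assumed sector granularity */

static int image_fd = -1;

int image_open(const char *path)
{
    image_fd = open(path, O_RDWR);
    return (image_fd < 0) ? -1 : 0;
}

/* Service one segment of a block request against the backing file. */
int image_rw(int is_write, unsigned long sector, void *buf, size_t len)
{
    off_t off = (off_t)sector * SECTOR_SIZE;
    ssize_t n = is_write ? pwrite(image_fd, buf, len, off)
                         : pread(image_fd, buf, len, off);
    return (n == (ssize_t)len) ? 0 : -1;
}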

Signed-off-by: Andrew Warfield <andrew.warfield@cl.cam.ac.uk>
22 files changed:
.hgignore
linux-2.6-xen-sparse/drivers/xen/blktap/Makefile
linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c
linux-2.6-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c [deleted file]
linux-2.6-xen-sparse/drivers/xen/blktap/blktap_datapath.c [deleted file]
linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c [deleted file]
linux-2.6-xen-sparse/drivers/xen/blktap/common.h [new file with mode: 0644]
linux-2.6-xen-sparse/drivers/xen/blktap/interface.c [new file with mode: 0644]
linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c [new file with mode: 0644]
linux-2.6-xen-sparse/mm/memory.c
tools/blktap/Makefile
tools/blktap/README.sept05 [new file with mode: 0644]
tools/blktap/blkdump.c
tools/blktap/blkif.c [new file with mode: 0644]
tools/blktap/blktaplib.c
tools/blktap/blktaplib.h
tools/blktap/list.h [new file with mode: 0644]
tools/blktap/ublkback/Makefile [new file with mode: 0644]
tools/blktap/ublkback/ublkback.c [new file with mode: 0644]
tools/blktap/ublkback/ublkbacklib.c [new file with mode: 0644]
tools/blktap/ublkback/ublkbacklib.h [new file with mode: 0644]
tools/blktap/xenbus.c [new file with mode: 0644]

index c99199863536da16d6d8134df3e66f432856eaf6..05465739db02f253cba0ea1c68f051ae3bdf12f2 100644 (file)
--- a/.hgignore
+++ b/.hgignore
@@ -82,6 +82,7 @@
 ^tools/blktap/parallax/vdi_validate$
 ^tools/blktap/parallax/parallax$
 ^tools/blktap/parallax/blockstored$
+^tools/blktap/ublkback/ublkback$
 ^tools/blktap/xen/.*$
 ^tools/check/\..*$
 ^tools/cmdline/.*$
index 80b7ca0627e66979969c9369b8e6b55192dda9d4..822b35f4b7d28f470d527adddd2e32ed628e5f9f 100644 (file)
@@ -1,3 +1,3 @@
 
-obj-y  := blktap_userdev.o blktap_datapath.o blktap_controlmsg.o blktap.o 
+obj-y  := xenbus.o interface.o blktap.o 
 
index 1e40fb2dbe7d2237f6a42386871ebd2075b0a10a..4d0b88c8ee377610f0ecf1537934ce9c047f8ba4 100644 (file)
 /******************************************************************************
- * blktap.c
+ * drivers/xen/blktap/blktap.c
  * 
- * XenLinux virtual block-device tap.
+ * This is a modified version of the block backend driver that remaps requests
+ * to a user-space memory region.  It is intended to be used to write 
+ * application-level servers that provide block interfaces to client VMs.
  * 
- * Copyright (c) 2004, Andrew Warfield
- *
- * Based on the original split block driver:
- * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
- * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
- * Copyright (c) 2004, Christian Limpach
- * 
- * Note that unlike the split block driver code, this driver has been developed
- * strictly for Linux 2.6
  */
 
-#include "blktap.h"
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <asm-xen/balloon.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/miscdevice.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/gfp.h>
+#include <linux/poll.h>
+#include <asm/tlbflush.h>
+#include "common.h"
+
+/* Only one process may open /dev/xen/blktap at any time. */
+static unsigned long blktap_dev_inuse;
+unsigned long blktap_ring_ok; /* make this ring->state */
+
+/* Rings up to user space. */
+static blkif_front_ring_t blktap_ufe_ring;
+
+/* for poll: */
+static wait_queue_head_t blktap_wait;
+
+/* current switching mode */
+static unsigned long blktap_mode;
+
+/* local prototypes */
+static int blktap_read_ufe_ring(void);
+
+
+/* /dev/xen/blktap resides at device number major=10, minor=202        */ 
+#define BLKTAP_MINOR 202
+
+/* blktap IOCTLs:                                                      */
+#define BLKTAP_IOCTL_KICK_FE         1
+#define BLKTAP_IOCTL_KICK_BE         2 /* currently unused */
+#define BLKTAP_IOCTL_SETMODE         3
+#define BLKTAP_IOCTL_PRINT_IDXS      100  
+
+/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
+#define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
+#define BLKTAP_MODE_INTERCEPT_FE     0x00000001
+#define BLKTAP_MODE_INTERCEPT_BE     0x00000002  /* unimp. */
+#define BLKTAP_MODE_COPY_FE          0x00000004  /* unimp. */
+#define BLKTAP_MODE_COPY_BE          0x00000008  /* unimp. */
+#define BLKTAP_MODE_COPY_FE_PAGES    0x00000010  /* unimp. */
+#define BLKTAP_MODE_COPY_BE_PAGES    0x00000020  /* unimp. */
+
+#define BLKTAP_MODE_INTERPOSE \
+           (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
+
+#define BLKTAP_MODE_COPY_BOTH \
+           (BLKTAP_MODE_COPY_FE | BLKTAP_MODE_COPY_BE)
+
+#define BLKTAP_MODE_COPY_BOTH_PAGES \
+           (BLKTAP_MODE_COPY_FE_PAGES | BLKTAP_MODE_COPY_BE_PAGES)
 
-int __init xlblktap_init(void)
+static inline int BLKTAP_MODE_VALID(unsigned long arg)
 {
-    ctrl_msg_t               cmsg;
-    blkif_fe_driver_status_t fe_st;
-    blkif_be_driver_status_t be_st;
+    return (
+        ( arg == BLKTAP_MODE_PASSTHROUGH  ) ||
+        ( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
+        ( arg == BLKTAP_MODE_INTERPOSE    ) );
+/*
+    return (
+        ( arg == BLKTAP_MODE_PASSTHROUGH  ) ||
+        ( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
+        ( arg == BLKTAP_MODE_INTERCEPT_BE ) ||
+        ( arg == BLKTAP_MODE_INTERPOSE    ) ||
+        ( (arg & ~BLKTAP_MODE_COPY_FE_PAGES) == BLKTAP_MODE_COPY_FE ) ||
+        ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE ) ||
+        ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH )
+        );
+*/
+}
+
+
+/******************************************************************
+ * MMAP REGION
+ */
+
+/*
+ * We use a big chunk of address space to map in-flight requests into,
+ * and export this region up to user-space.  See the comments in blkback
+ * about this -- the two must be kept in sync if the tap is used as a 
+ * passthrough.
+ */
+
+#define MAX_PENDING_REQS 64
+#define BATCH_PER_DOMAIN 16
 
-    printk(KERN_INFO "Initialising Xen block tap device\n");
-#ifdef CONFIG_XEN_BLKDEV_GRANT
-    printk(KERN_INFO "Block tap is using grant tables.\n");
-#endif
+/* immediately before the mmap area, we have a bunch of pages reserved
+ * for shared memory rings.
+ */
+#define RING_PAGES 1 /* Front */ 
+
+/* Where things are inside the device mapping. */
+struct vm_area_struct *blktap_vma = NULL;
+unsigned long mmap_vstart;  /* Kernel pages for mapping in data. */
+unsigned long rings_vstart; /* start of mmaped vma               */
+unsigned long user_vstart;  /* start of user mappings            */
 
-    DPRINTK("   tap - Backend connection init:\n");
+#define MMAP_PAGES                                              \
+    (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+#define MMAP_VADDR(_start, _req,_seg)                           \
+    (_start +                                                   \
+     ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +    \
+     ((_seg) * PAGE_SIZE))
 
 
-    (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
-                                    CALLBACK_IN_BLOCKING_CONTEXT);
 
-    /* Send a driver-UP notification to the domain controller. */
-    cmsg.type      = CMSG_BLKIF_FE;
-    cmsg.subtype   = CMSG_BLKIF_FE_DRIVER_STATUS;
-    cmsg.length    = sizeof(blkif_fe_driver_status_t);
-    fe_st.status   = BLKIF_DRIVER_STATUS_UP;
-    memcpy(cmsg.msg, &fe_st, sizeof(fe_st));
-    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+/*
+ * Each outstanding request that we've passed to the lower device layers has a 
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements 
+ * the pendcnt towards zero. When it hits zero, the specified domain has a 
+ * response queued for it, with the saved 'id' passed back.
+ */
+typedef struct {
+    blkif_t       *blkif;
+    unsigned long  id;
+    int            nr_pages;
+    atomic_t       pendcnt;
+    unsigned short operation;
+    int            status;
+} pending_req_t;
+
+/*
+ * We can't allocate pending_req's in order, since they may complete out of 
+ * order. We therefore maintain an allocation ring. This ring also indicates 
+ * when enough work has been passed down -- at that point the allocation ring 
+ * will be empty.
+ */
+static pending_req_t pending_reqs[MAX_PENDING_REQS];
+static unsigned char pending_ring[MAX_PENDING_REQS];
+static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
+/* NB. We use a different index type to differentiate from shared blk rings. */
+typedef unsigned int PEND_RING_IDX;
+#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
+static PEND_RING_IDX pending_prod, pending_cons;
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
+
+/* Requests passing through the tap to the backend hijack the id field
+ * in the request message.  In it we put the AR index _AND_ the fe domid.
+ * the domid is used by the backend to map the pages properly.
+ */
 
-    DPRINTK("   tap - Frontend connection init:\n");
+static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
+{
+    return ( (fe_dom << 16) | MASK_PEND_IDX(idx) );
+}
+
+extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id) 
+{ 
+    return (PEND_RING_IDX)( id & 0x0000ffff );
+}
+
+extern inline domid_t ID_TO_DOM(unsigned long id) 
+{ 
+    return (domid_t)(id >> 16); 
+}
+
+
+
+/******************************************************************
+ * GRANT HANDLES
+ */
+
+/* When using grant tables to map a frame for device access then the
+ * handle returned must be used to unmap the frame. This is needed to
+ * drop the ref count on the frame.
+ */
+struct grant_handle_pair
+{
+    u16  kernel;
+    u16  user;
+};
+static struct grant_handle_pair pending_grant_handles[MMAP_PAGES];
+#define pending_handle(_idx, _i) \
+    (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
+#define BLKTAP_INVALID_HANDLE(_g) \
+    (((_g->kernel) == 0xFFFF) && ((_g->user) == 0xFFFF))
+#define BLKTAP_INVALIDATE_HANDLE(_g) do {       \
+    (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \
+    } while(0)
+
+
+/******************************************************************
+ * BLKTAP VM OPS
+ */
+
+static struct page *blktap_nopage(struct vm_area_struct *vma,
+                                             unsigned long address,
+                                             int *type)
+{
+    /*
+     * if the page has not been mapped in by the driver then generate
+     * a SIGBUS to the domain.
+     */
+
+    force_sig(SIGBUS, current);
+
+    return 0;
+}
+
+struct vm_operations_struct blktap_vm_ops = {
+    nopage:   blktap_nopage,
+};
+
+/******************************************************************
+ * BLKTAP FILE OPS
+ */
+
+static int blktap_open(struct inode *inode, struct file *filp)
+{
+    blkif_sring_t *sring;
     
-    active_reqs_init();
-    blkif_interface_init();
-    blkdev_schedule_init();
+    if ( test_and_set_bit(0, &blktap_dev_inuse) )
+        return -EBUSY;
+    
+    /* Allocate the fe ring. */
+    sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
+    if (sring == NULL)
+        goto fail_nomem;
+
+    SetPageReserved(virt_to_page(sring));
     
-    (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx, 
-                                    CALLBACK_IN_BLOCKING_CONTEXT);
+    SHARED_RING_INIT(sring);
+    FRONT_RING_INIT(&blktap_ufe_ring, sring, PAGE_SIZE);
+
+    return 0;
+
+ fail_nomem:
+    return -ENOMEM;
+}
+
+static int blktap_release(struct inode *inode, struct file *filp)
+{
+    blktap_dev_inuse = 0;
+    blktap_ring_ok = 0;
+
+    /* Free the ring page. */
+    ClearPageReserved(virt_to_page(blktap_ufe_ring.sring));
+    free_page((unsigned long) blktap_ufe_ring.sring);
+
+    /* Clear any active mappings and free foreign map table */
+    if (blktap_vma != NULL) {
+        zap_page_range(blktap_vma, blktap_vma->vm_start, 
+                       blktap_vma->vm_end - blktap_vma->vm_start, NULL);
+        blktap_vma = NULL;
+    }
+
+    return 0;
+}
+
+
+/* Note on mmap:
+ * We need to map pages to user space in a way that will allow the block
+ * subsystem set up direct IO to them.  This couldn't be done before, because
+ * there isn't really a sane way to translate a user virtual address down to a 
+ * physical address when the page belongs to another domain.
+ *
+ * My first approach was to map the page in to kernel memory, add an entry
+ * for it in the physical frame list (using alloc_lomem_region as in blkback)
+ * and then attempt to map that page up to user space.  This is disallowed
+ * by xen though, which realizes that we don't really own the machine frame
+ * underlying the physical page.
+ *
+ * The new approach is to provide explicit support for this in xen linux.
+ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
+ * mapped from other vms.  vma->vm_private_data is set up as a mapping 
+ * from pages to actual page structs.  There is a new clause in get_user_pages
+ * that does the right thing for this sort of mapping.
+ */
+static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+    int size;
+    struct page **map;
+    int i;
+
+    DPRINTK(KERN_ALERT "blktap mmap (%lx, %lx)\n",
+           vma->vm_start, vma->vm_end);
+
+    vma->vm_flags |= VM_RESERVED;
+    vma->vm_ops = &blktap_vm_ops;
+
+    size = vma->vm_end - vma->vm_start;
+    if ( size != ( (MMAP_PAGES + RING_PAGES) << PAGE_SHIFT ) ) {
+        printk(KERN_INFO 
+               "blktap: you _must_ map exactly %d pages!\n",
+               MMAP_PAGES + RING_PAGES);
+        return -EAGAIN;
+    }
+
+    size >>= PAGE_SHIFT;
+    DPRINTK(KERN_INFO "blktap: 2 rings + %d pages.\n", size-1);
+    
+    rings_vstart = vma->vm_start;
+    user_vstart  = rings_vstart + (RING_PAGES << PAGE_SHIFT);
+    
+    /* Map the ring pages to the start of the region and reserve it. */
+
+    /* not sure if I really need to do this... */
+    vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
-    /* Send a driver-UP notification to the domain controller. */
-    cmsg.type      = CMSG_BLKIF_BE;
-    cmsg.subtype   = CMSG_BLKIF_BE_DRIVER_STATUS;
-    cmsg.length    = sizeof(blkif_be_driver_status_t);
-    be_st.status   = BLKIF_DRIVER_STATUS_UP;
-    memcpy(cmsg.msg, &be_st, sizeof(be_st));
-    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+    if (remap_pfn_range(vma, vma->vm_start, 
+                         __pa(blktap_ufe_ring.sring) >> PAGE_SHIFT, 
+                         PAGE_SIZE, vma->vm_page_prot)) 
+    {
+        WPRINTK("Mapping user ring failed!\n");
+        goto fail;
+    }
 
-    DPRINTK("   tap - Userland channel init:\n");
+    /* Mark this VM as containing foreign pages, and set up mappings. */
+    map = kmalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
+                  * sizeof(struct page *),
+                  GFP_KERNEL);
+    if (map == NULL) 
+    {
+        WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
+        goto fail;
+    }
 
-    blktap_init();
+    for (i=0; i<((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
+        map[i] = NULL;
+    
+    vma->vm_private_data = map;
+    vma->vm_flags |= VM_FOREIGN;
 
-    DPRINTK("Blkif tap device initialized.\n");
+    blktap_vma = vma;
+    blktap_ring_ok = 1;
 
     return 0;
+ fail:
+    /* Clear any active mappings. */
+    zap_page_range(vma, vma->vm_start, 
+                   vma->vm_end - vma->vm_start, NULL);
+
+    return -ENOMEM;
 }
 
-#if 0 /* tap doesn't handle suspend/resume */
-void blkdev_suspend(void)
+static int blktap_ioctl(struct inode *inode, struct file *filp,
+                        unsigned int cmd, unsigned long arg)
 {
+    switch(cmd) {
+    case BLKTAP_IOCTL_KICK_FE: /* There are fe messages to process. */
+        return blktap_read_ufe_ring();
+
+    case BLKTAP_IOCTL_SETMODE:
+        if (BLKTAP_MODE_VALID(arg)) {
+            blktap_mode = arg;
+            /* XXX: may need to flush rings here. */
+            printk(KERN_INFO "blktap: set mode to %lx\n", arg);
+            return 0;
+        }
+    case BLKTAP_IOCTL_PRINT_IDXS:
+        {
+            //print_fe_ring_idxs();
+            WPRINTK("User Rings: \n-----------\n");
+            WPRINTK("UF: rsp_cons: %2d, req_prod_pvt: %2d "
+                            "| req_prod: %2d, rsp_prod: %2d\n",
+                            blktap_ufe_ring.rsp_cons,
+                            blktap_ufe_ring.req_prod_pvt,
+                            blktap_ufe_ring.sring->req_prod,
+                            blktap_ufe_ring.sring->rsp_prod);
+            
+        }
+    }
+    return -ENOIOCTLCMD;
+}
+
+static unsigned int blktap_poll(struct file *file, poll_table *wait)
+{
+        poll_wait(file, &blktap_wait, wait);
+        if ( RING_HAS_UNPUSHED_REQUESTS(&blktap_ufe_ring) ) 
+        {
+            flush_tlb_all();
+
+            RING_PUSH_REQUESTS(&blktap_ufe_ring);
+            return POLLIN | POLLRDNORM;
+        }
+
+        return 0;
 }
 
-void blkdev_resume(void)
+void blktap_kick_user(void)
 {
-    ctrl_msg_t               cmsg;
-    blkif_fe_driver_status_t st;    
+    /* blktap_ring->req_prod = blktap_req_prod; */
+    wake_up_interruptible(&blktap_wait);
+}
+
+static struct file_operations blktap_fops = {
+    owner:    THIS_MODULE,
+    poll:     blktap_poll,
+    ioctl:    blktap_ioctl,
+    open:     blktap_open,
+    release:  blktap_release,
+    mmap:     blktap_mmap,
+};
+
+
+
+static int do_block_io_op(blkif_t *blkif, int max_to_do);
+static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
+static void make_response(blkif_t *blkif, unsigned long id, 
+                          unsigned short op, int st);
+
+
+static void fast_flush_area(int idx, int nr_pages)
+{
+    struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
+    unsigned int i, op = 0;
+    struct grant_handle_pair *handle;
+    unsigned long ptep;
+
+    for (i=0; i<nr_pages; i++)
+    {
+        handle = &pending_handle(idx, i);
+        if (!BLKTAP_INVALID_HANDLE(handle))
+        {
+
+            unmap[op].host_addr = MMAP_VADDR(mmap_vstart, idx, i);
+            unmap[op].dev_bus_addr = 0;
+            unmap[op].handle = handle->kernel;
+            op++;
+
+            if (create_lookup_pte_addr(blktap_vma->vm_mm,
+                                       MMAP_VADDR(user_vstart, idx, i), 
+                                       &ptep) !=0) {
+                DPRINTK("Couldn't get a pte addr!\n");
+                return;
+            }
+            unmap[op].host_addr    = ptep;
+            unmap[op].dev_bus_addr = 0;
+            unmap[op].handle       = handle->user;
+            op++;
+            
+           BLKTAP_INVALIDATE_HANDLE(handle);
+        }
+    }
+    if ( unlikely(HYPERVISOR_grant_table_op(
+        GNTTABOP_unmap_grant_ref, unmap, op)))
+        BUG();
+
+    if (blktap_vma != NULL)
+        zap_page_range(blktap_vma, 
+                       MMAP_VADDR(user_vstart, idx, 0), 
+                       nr_pages << PAGE_SHIFT, NULL);
+}
+
+/******************************************************************
+ * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
+ */
+
+static struct list_head blkio_schedule_list;
+static spinlock_t blkio_schedule_list_lock;
+
+static int __on_blkdev_list(blkif_t *blkif)
+{
+    return blkif->blkdev_list.next != NULL;
+}
 
-    /* Send a driver-UP notification to the domain controller. */
-    cmsg.type      = CMSG_BLKIF_FE;
-    cmsg.subtype   = CMSG_BLKIF_FE_DRIVER_STATUS;
-    cmsg.length    = sizeof(blkif_fe_driver_status_t);
-    st.status      = BLKIF_DRIVER_STATUS_UP;
-    memcpy(cmsg.msg, &st, sizeof(st));
-    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+static void remove_from_blkdev_list(blkif_t *blkif)
+{
+    unsigned long flags;
+    if ( !__on_blkdev_list(blkif) ) return;
+    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
+    if ( __on_blkdev_list(blkif) )
+    {
+        list_del(&blkif->blkdev_list);
+        blkif->blkdev_list.next = NULL;
+        blkif_put(blkif);
+    }
+    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
+}
+
+static void add_to_blkdev_list_tail(blkif_t *blkif)
+{
+    unsigned long flags;
+    if ( __on_blkdev_list(blkif) ) return;
+    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
+    if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) )
+    {
+        list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
+        blkif_get(blkif);
+    }
+    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
+}
+
+
+/******************************************************************
+ * SCHEDULER FUNCTIONS
+ */
+
+static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
+
+static int blkio_schedule(void *arg)
+{
+    DECLARE_WAITQUEUE(wq, current);
+
+    blkif_t          *blkif;
+    struct list_head *ent;
+
+    daemonize("xenblkd");
+
+    for ( ; ; )
+    {
+        /* Wait for work to do. */
+        add_wait_queue(&blkio_schedule_wait, &wq);
+        set_current_state(TASK_INTERRUPTIBLE);
+        if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || 
+             list_empty(&blkio_schedule_list) )
+            schedule();
+        __set_current_state(TASK_RUNNING);
+        remove_wait_queue(&blkio_schedule_wait, &wq);
+
+        /* Queue up a batch of requests. */
+        while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
+                !list_empty(&blkio_schedule_list) )
+        {
+            ent = blkio_schedule_list.next;
+            blkif = list_entry(ent, blkif_t, blkdev_list);
+            blkif_get(blkif);
+            remove_from_blkdev_list(blkif);
+            if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
+                add_to_blkdev_list_tail(blkif);
+            blkif_put(blkif);
+        }
+    }
+}
+
+static void maybe_trigger_blkio_schedule(void)
+{
+    /*
+     * Needed so that two processes, who together make the following predicate
+     * true, don't both read stale values and evaluate the predicate
+     * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
+     */
+    smp_mb();
+
+    if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
+         !list_empty(&blkio_schedule_list) )
+        wake_up(&blkio_schedule_wait);
+}
+
+
+
+/******************************************************************
+ * COMPLETION CALLBACK -- Called as bh->b_end_io()
+ */
+
+
+static int blktap_read_ufe_ring(void)
+{
+    /* This is called to read responses from the UFE ring. */
+
+    RING_IDX i, j, rp;
+    blkif_response_t *resp;
+    blkif_t *blkif;
+    int pending_idx;
+    pending_req_t *pending_req;
+    unsigned long     flags;
+
+    /* if we are forwarding from the UFE ring to the FE ring */
+    if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) {
+
+        /* for each outstanding message on the UFEring  */
+        rp = blktap_ufe_ring.sring->rsp_prod;
+        rmb();
+        
+        for ( i = blktap_ufe_ring.rsp_cons; i != rp; i++ )
+        {
+            resp = RING_GET_RESPONSE(&blktap_ufe_ring, i);
+            pending_idx = MASK_PEND_IDX(ID_TO_IDX(resp->id));
+            pending_req = &pending_reqs[pending_idx];
+            
+            blkif = pending_req->blkif;
+            for (j = 0; j < pending_req->nr_pages; j++) {
+                unsigned long vaddr;
+                struct page **map = blktap_vma->vm_private_data;
+                int offset; 
+
+                vaddr  = MMAP_VADDR(user_vstart, pending_idx, j);
+                offset = (vaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
+
+                //ClearPageReserved(virt_to_page(vaddr));
+                ClearPageReserved((struct page *)map[offset]);
+                map[offset] = NULL;
+            }
+
+            fast_flush_area(pending_idx, pending_req->nr_pages);
+            make_response(blkif, pending_req->id, resp->operation, 
+                          resp->status);
+            blkif_put(pending_req->blkif);
+            spin_lock_irqsave(&pend_prod_lock, flags);
+            pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+            spin_unlock_irqrestore(&pend_prod_lock, flags);
+        }
+        blktap_ufe_ring.rsp_cons = i;
+        maybe_trigger_blkio_schedule();
+    }
+    return 0;
+}
+
+
+/******************************************************************************
+ * NOTIFICATION FROM GUEST OS.
+ */
+
+irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
+{
+    blkif_t *blkif = dev_id;
+    add_to_blkdev_list_tail(blkif);
+    maybe_trigger_blkio_schedule();
+    return IRQ_HANDLED;
+}
+
+
+
+/******************************************************************
+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
+ */
+
+static int do_block_io_op(blkif_t *blkif, int max_to_do)
+{
+    blkif_back_ring_t *blk_ring = &blkif->blk_ring;
+    blkif_request_t *req;
+    RING_IDX i, rp;
+    int more_to_do = 0;
+    
+    rp = blk_ring->sring->req_prod;
+    rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+    for ( i = blk_ring->req_cons; 
+         (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i);
+          i++ )
+    {
+        if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
+        {
+            more_to_do = 1;
+            break;
+        }
+        
+        req = RING_GET_REQUEST(blk_ring, i);
+        switch ( req->operation )
+        {
+        case BLKIF_OP_READ:
+        case BLKIF_OP_WRITE:
+            dispatch_rw_block_io(blkif, req);
+            break;
+
+        default:
+            DPRINTK("error: unknown block io operation [%d]\n",
+                    req->operation);
+            make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+            break;
+        }
+    }
+
+    blk_ring->req_cons = i;
+    blktap_kick_user();
+
+    return more_to_do;
+}
+
+static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
+{
+    blkif_request_t *target;
+    int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
+    pending_req_t *pending_req;
+    struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
+    int op, ret;
+    unsigned int nseg;
+
+    /* Check that number of segments is sane. */
+    nseg = req->nr_segments;
+    if ( unlikely(nseg == 0) || 
+         unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
+    {
+        DPRINTK("Bad number of segments in request (%d)\n", nseg);
+        goto bad_descriptor;
+    }
+
+    /* Make sure userspace is ready. */
+    if (!blktap_ring_ok) {
+        DPRINTK("blktap: ring not ready for requests!\n");
+        goto bad_descriptor;
+    }
+    
+
+    if ( RING_FULL(&blktap_ufe_ring) ) {
+        WPRINTK("blktap: fe_ring is full, can't add (very broken!).\n");
+        goto bad_descriptor;
+    }
+
+    flush_cache_all(); /* a noop on intel... */
+
+    /* Map the foreign pages directly in to the application */    
+    op = 0;
+    for (i=0; i<req->nr_segments; i++) {
+
+        unsigned long uvaddr;
+        unsigned long kvaddr;
+        unsigned long ptep;
+
+        uvaddr = MMAP_VADDR(user_vstart, pending_idx, i);
+        kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i);
+
+        /* Map the remote page to kernel. */
+        map[op].host_addr = kvaddr;
+        map[op].dom   = blkif->domid;
+        map[op].ref   = blkif_gref_from_fas(req->frame_and_sects[i]);
+        map[op].flags = GNTMAP_host_map;
+        /* This needs a bit more thought in terms of interposition: 
+         * If we want to be able to modify pages during write using 
+         * grant table mappings, the guest will either need to allow 
+         * it, or we'll need to incur a copy. Bit of an fbufs moment. ;) */
+        if (req->operation == BLKIF_OP_WRITE)
+            map[op].flags |= GNTMAP_readonly;
+        op++;
+
+        /* Now map it to user. */
+        ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep);
+        if (ret)
+        {
+            DPRINTK("Couldn't get a pte addr!\n");
+            fast_flush_area(pending_idx, req->nr_segments);
+            goto bad_descriptor;
+        }
+
+        map[op].host_addr = ptep;
+        map[op].dom       = blkif->domid;
+        map[op].ref       = blkif_gref_from_fas(req->frame_and_sects[i]);
+        map[op].flags     = GNTMAP_host_map | GNTMAP_application_map
+                            | GNTMAP_contains_pte;
+        /* Above interposition comment applies here as well. */
+        if (req->operation == BLKIF_OP_WRITE)
+            map[op].flags |= GNTMAP_readonly;
+        op++;
+    }
+
+    if ( unlikely(HYPERVISOR_grant_table_op(
+            GNTTABOP_map_grant_ref, map, op)))
+        BUG();
+
+    op = 0;
+    for (i=0; i<(req->nr_segments*2); i+=2) {
+        unsigned long uvaddr;
+        unsigned long kvaddr;
+        unsigned long offset;
+        int cancel = 0;
+
+        uvaddr = MMAP_VADDR(user_vstart, pending_idx, i/2);
+        kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i/2);
+
+        if ( unlikely(map[i].handle < 0) ) 
+        {
+            DPRINTK("Error on kernel grant mapping (%d)\n", map[i].handle);
+            ret = map[i].handle;
+            cancel = 1;
+        }
+
+        if ( unlikely(map[i+1].handle < 0) ) 
+        {
+            DPRINTK("Error on user grant mapping (%d)\n", map[i+1].handle);
+            ret = map[i+1].handle;
+            cancel = 1;
+        }
+
+        if (cancel) 
+        {
+            fast_flush_area(pending_idx, req->nr_segments);
+            goto bad_descriptor;
+        }
+
+        /* Set the necessary mappings in p2m and in the VM_FOREIGN 
+         * vm_area_struct to allow user vaddr -> struct page lookups
+         * to work.  This is needed for direct IO to foreign pages. */
+        phys_to_machine_mapping[__pa(kvaddr) >> PAGE_SHIFT] =
+            FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT);
+
+        offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
+        ((struct page **)blktap_vma->vm_private_data)[offset] =
+            pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+
+        /* Save handles for unmapping later. */
+        pending_handle(pending_idx, i/2).kernel = map[i].handle;
+        pending_handle(pending_idx, i/2).user   = map[i+1].handle;
+    }
+
+    /* Mark mapped pages as reserved: */
+    for ( i = 0; i < req->nr_segments; i++ )
+    {
+        unsigned long kvaddr;
+
+        kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i);
+        SetPageReserved(pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT));
+    }
+
+    pending_req = &pending_reqs[pending_idx];
+    pending_req->blkif     = blkif;
+    pending_req->id        = req->id;
+    pending_req->operation = req->operation;
+    pending_req->status    = BLKIF_RSP_OKAY;
+    pending_req->nr_pages  = nseg;
+    req->id = MAKE_ID(blkif->domid, pending_idx);
+    //atomic_set(&pending_req->pendcnt, nbio);
+    pending_cons++;
+    blkif_get(blkif);
+
+    /* Finally, write the request message to the user ring. */
+    target = RING_GET_REQUEST(&blktap_ufe_ring, blktap_ufe_ring.req_prod_pvt);
+    memcpy(target, req, sizeof(*req));
+    blktap_ufe_ring.req_prod_pvt++;
+    return;
+
+ bad_descriptor:
+    make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+} 
+
+
+
+/******************************************************************
+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
+ */
+
+
+static void make_response(blkif_t *blkif, unsigned long id, 
+                          unsigned short op, int st)
+{
+    blkif_response_t *resp;
+    unsigned long     flags;
+    blkif_back_ring_t *blk_ring = &blkif->blk_ring;
+
+    /* Place on the response ring for the relevant domain. */ 
+    spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+    resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
+    resp->id        = id;
+    resp->operation = op;
+    resp->status    = st;
+    wmb(); /* Ensure other side can see the response fields. */
+    blk_ring->rsp_prod_pvt++;
+    RING_PUSH_RESPONSES(blk_ring);
+    spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+
+    /* Kick the relevant domain. */
+    notify_via_evtchn(blkif->evtchn);
+}
+
+static struct miscdevice blktap_miscdev = {
+    .minor        = BLKTAP_MINOR,
+    .name         = "blktap",
+    .fops         = &blktap_fops,
+    .devfs_name   = "misc/blktap",
+};
+
+void blkif_deschedule(blkif_t *blkif)
+{
+    remove_from_blkdev_list(blkif);
+}
+
+static int __init blkif_init(void)
+{
+    int i, j, err;
+    struct page *page;
+/*
+    if ( !(xen_start_info.flags & SIF_INITDOMAIN) &&
+         !(xen_start_info.flags & SIF_BLK_BE_DOMAIN) )
+        return 0;
+*/
+    blkif_interface_init();
+
+    page = balloon_alloc_empty_page_range(MMAP_PAGES);
+    BUG_ON(page == NULL);
+    mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+
+    pending_cons = 0;
+    pending_prod = MAX_PENDING_REQS;
+    memset(pending_reqs, 0, sizeof(pending_reqs));
+    for ( i = 0; i < MAX_PENDING_REQS; i++ )
+        pending_ring[i] = i;
+    
+    spin_lock_init(&blkio_schedule_list_lock);
+    INIT_LIST_HEAD(&blkio_schedule_list);
+
+    if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 )
+        BUG();
+
+    blkif_xenbus_init();
+
+    for (i=0; i<MAX_PENDING_REQS ; i++)
+        for (j=0; j<BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
+            BLKTAP_INVALIDATE_HANDLE(&pending_handle(i, j));
+
+    err = misc_register(&blktap_miscdev);
+    if ( err != 0 )
+    {
+        printk(KERN_ALERT "Couldn't register /dev/misc/blktap (%d)\n", err);
+        return err;
+    }
+
+    init_waitqueue_head(&blktap_wait);
+
+    return 0;
 }
-#endif
 
-__initcall(xlblktap_init);
+__initcall(blkif_init);
diff --git a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c
deleted file mode 100644 (file)
index dd39aff..0000000
+++ /dev/null
@@ -1,573 +0,0 @@
-/******************************************************************************
- * blktap_controlmsg.c
- * 
- * XenLinux virtual block-device tap.
- * Control interfaces to the frontend and backend drivers.
- * 
- * Copyright (c) 2004, Andrew Warfield
- *
- */
-#include "blktap.h"
-#include <asm-xen/evtchn.h>
-
-static char *blkif_state_name[] = {
-    [BLKIF_STATE_CLOSED]       = "closed",
-    [BLKIF_STATE_DISCONNECTED] = "disconnected",
-    [BLKIF_STATE_CONNECTED]    = "connected",
-};
-
-static char *blkif_status_name[] = {
-    [BLKIF_INTERFACE_STATUS_CLOSED]       = "closed",
-    [BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected",
-    [BLKIF_INTERFACE_STATUS_CONNECTED]    = "connected",
-    [BLKIF_INTERFACE_STATUS_CHANGED]      = "changed",
-};
-
-unsigned int blktap_be_state = BLKIF_STATE_CLOSED;
-unsigned int blktap_be_evtchn;
-
-/*-----[ Control Messages to/from Frontend VMs ]--------------------------*/
-
-#define BLKIF_HASHSZ 1024
-#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
-
-static kmem_cache_t *blkif_cachep;
-static blkif_t      *blkif_hash[BLKIF_HASHSZ];
-
-blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
-{
-    blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)];
-    while ( (blkif != NULL) && 
-            ((blkif->domid != domid) || (blkif->handle != handle)) )
-        blkif = blkif->hash_next;
-    return blkif;
-}
-
-static void __blkif_disconnect_complete(void *arg)
-{
-    blkif_t              *blkif = (blkif_t *)arg;
-    ctrl_msg_t            cmsg;
-    blkif_be_disconnect_t disc;
-#ifdef CONFIG_XEN_BLKDEV_GRANT
-    struct gnttab_unmap_grant_ref op;
-#endif
-
-    /*
-     * These can't be done in blkif_disconnect() because at that point there
-     * may be outstanding requests at the disc whose asynchronous responses
-     * must still be notified to the remote driver.
-     */
-#ifdef CONFIG_XEN_BLKDEV_GRANT
-    op.host_addr = blkif->shmem_vaddr;
-    op.handle         = blkif->shmem_handle;
-    op.dev_bus_addr   = 0;
-    BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
-#endif
-    vfree(blkif->blk_ring.sring);
-
-    /* Construct the deferred response message. */
-    cmsg.type         = CMSG_BLKIF_BE;
-    cmsg.subtype      = CMSG_BLKIF_BE_DISCONNECT;
-    cmsg.id           = blkif->disconnect_rspid;
-    cmsg.length       = sizeof(blkif_be_disconnect_t);
-    disc.domid        = blkif->domid;
-    disc.blkif_handle = blkif->handle;
-    disc.status       = BLKIF_BE_STATUS_OKAY;
-    memcpy(cmsg.msg, &disc, sizeof(disc));
-
-    /*
-     * Make sure message is constructed /before/ status change, because
-     * after the status change the 'blkif' structure could be deallocated at
-     * any time. Also make sure we send the response /after/ status change,
-     * as otherwise a subsequent CONNECT request could spuriously fail if
-     * another CPU doesn't see the status change yet.
-     */
-    mb();
-    if ( blkif->status != DISCONNECTING )
-        BUG();
-    blkif->status = DISCONNECTED;
-    mb();
-
-    /* Send the successful response. */
-    ctrl_if_send_response(&cmsg);
-}
-
-void blkif_disconnect_complete(blkif_t *blkif)
-{
-    INIT_WORK(&blkif->work, __blkif_disconnect_complete, (void *)blkif);
-    schedule_work(&blkif->work);
-}
-
-void blkif_ptfe_create(blkif_be_create_t *create)
-{
-    blkif_t      *blkif, **pblkif;
-    domid_t       domid  = create->domid;
-    unsigned int  handle = create->blkif_handle;
-
-
-    /* May want to store info on the connecting domain here. */
-
-    DPRINTK("PT got BE_CREATE\n");
-
-    if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL )
-    {
-        WPRINTK("Could not create blkif: out of memory\n");
-        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
-        return;
-    }
-
-    /* blkif struct init code from blkback.c */
-    memset(blkif, 0, sizeof(*blkif));
-    blkif->domid  = domid;
-    blkif->handle = handle;
-    blkif->status = DISCONNECTED;  
-    spin_lock_init(&blkif->blk_ring_lock);
-    atomic_set(&blkif->refcnt, 0);
-
-    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
-    while ( *pblkif != NULL )
-    {
-        if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) )
-        {
-            WPRINTK("Could not create blkif: already exists\n");
-            create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
-            kmem_cache_free(blkif_cachep, blkif);
-            return;
-        }
-        pblkif = &(*pblkif)->hash_next;
-    }
-
-    blkif->hash_next = *pblkif;
-    *pblkif = blkif;
-
-    create->status = BLKIF_BE_STATUS_OKAY;
-}
-
-
-void blkif_ptfe_destroy(blkif_be_destroy_t *destroy)
-{
-    /* Clear anything that we initialized above. */
-
-    domid_t       domid  = destroy->domid;
-    unsigned int  handle = destroy->blkif_handle;
-    blkif_t     **pblkif, *blkif;
-
-    DPRINTK("PT got BE_DESTROY\n");
-    
-    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
-    while ( (blkif = *pblkif) != NULL )
-    {
-        if ( (blkif->domid == domid) && (blkif->handle == handle) )
-        {
-            if ( blkif->status != DISCONNECTED )
-                goto still_connected;
-            goto destroy;
-        }
-        pblkif = &blkif->hash_next;
-    }
-
-    destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
-    return;
-
- still_connected:
-    destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
-    return;
-
- destroy:
-    *pblkif = blkif->hash_next;
-    kmem_cache_free(blkif_cachep, blkif);
-    destroy->status = BLKIF_BE_STATUS_OKAY;
-}
-
-void blkif_ptfe_connect(blkif_be_connect_t *connect)
-{
-    domid_t        domid  = connect->domid;
-    unsigned int   handle = connect->blkif_handle;
-    unsigned int   evtchn = connect->evtchn;
-    unsigned long  shmem_frame = connect->shmem_frame;
-    struct vm_struct *vma;
-#ifdef CONFIG_XEN_BLKDEV_GRANT
-    int ref = connect->shmem_ref;
-#else
-    pgprot_t       prot;
-    int            error;
-#endif
-    blkif_t       *blkif;
-    blkif_sring_t *sring;
-
-    DPRINTK("PT got BE_CONNECT\n");
-
-    blkif = blkif_find_by_handle(domid, handle);
-    if ( unlikely(blkif == NULL) )
-    {
-        WPRINTK("blkif_connect attempted for non-existent blkif (%u,%u)\n", 
-                connect->domid, connect->blkif_handle); 
-        connect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
-        return;
-    }
-
-    if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL )
-    {
-        connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
-        return;
-    }
-
-#ifndef CONFIG_XEN_BLKDEV_GRANT
-    prot = __pgprot(_KERNPG_TABLE);
-    error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr),
-                                    shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
-                                    prot, domid);
-    if ( error != 0 )
-    {
-        if ( error == -ENOMEM ) 
-            connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
-        else if ( error == -EFAULT )
-            connect->status = BLKIF_BE_STATUS_MAPPING_ERROR;
-        else
-            connect->status = BLKIF_BE_STATUS_ERROR;
-        vfree(vma->addr);
-        return;
-    }
-#else
-    { /* Map: Use the Grant table reference */
-        struct gnttab_map_grant_ref op;
-        op.host_addr = VMALLOC_VMADDR(vma->addr);
-        op.flags            = GNTMAP_host_map;
-        op.ref              = ref;
-        op.dom              = domid;
-       
-        BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) );
-       
-        handle = op.handle;
-       
-        if (op.handle < 0) {
-            DPRINTK(" Grant table operation failure !\n");
-            connect->status = BLKIF_BE_STATUS_MAPPING_ERROR;
-            vfree(vma->addr);
-            return;
-        }
-
-        blkif->shmem_ref = ref;
-        blkif->shmem_handle = handle;
-        blkif->shmem_vaddr = VMALLOC_VMADDR(vma->addr);
-    }
-#endif
-
-    if ( blkif->status != DISCONNECTED )
-    {
-        connect->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
-        vfree(vma->addr);
-        return;
-    }
-
-    sring = (blkif_sring_t *)vma->addr;
-    SHARED_RING_INIT(sring);
-    BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE);
-    
-    blkif->evtchn        = evtchn;
-    blkif->shmem_frame   = shmem_frame;
-    blkif->status        = CONNECTED;
-    blkif_get(blkif);
-
-    bind_evtchn_to_irqhandler(
-        evtchn, blkif_ptfe_int, 0, "blkif-pt-backend", blkif);
-
-    connect->status = BLKIF_BE_STATUS_OKAY;
-}
-
-int blkif_ptfe_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id)
-{
-    domid_t       domid  = disconnect->domid;
-    unsigned int  handle = disconnect->blkif_handle;
-    blkif_t      *blkif;
-
-    DPRINTK("PT got BE_DISCONNECT\n");
-    
-    blkif = blkif_find_by_handle(domid, handle);
-    if ( unlikely(blkif == NULL) )
-    {
-        WPRINTK("blkif_disconnect attempted for non-existent blkif"
-                " (%u,%u)\n", disconnect->domid, disconnect->blkif_handle); 
-        disconnect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
-        return 1; /* Caller will send response error message. */
-    }
-
-    if ( blkif->status == CONNECTED )
-    {
-        blkif->status = DISCONNECTING;
-        blkif->disconnect_rspid = rsp_id;
-        wmb(); /* Let other CPUs see the status change. */
-        unbind_evtchn_from_irqhandler(blkif->evtchn, blkif);
-        blkif_deschedule(blkif);
-        blkif_put(blkif);
-        return 0; /* Caller should not send response message. */
-    }
-
-    disconnect->status = BLKIF_BE_STATUS_OKAY;
-    return 1;
-}
-
-/*-----[ Control Messages to/from Backend VM ]----------------------------*/
-
-/* Tell the controller to bring up the interface. */
-static void blkif_ptbe_send_interface_connect(void)
-{
-    ctrl_msg_t cmsg = {
-        .type    = CMSG_BLKIF_FE,
-        .subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT,
-        .length  = sizeof(blkif_fe_interface_connect_t),
-    };
-    blkif_fe_interface_connect_t *msg = (void*)cmsg.msg;
-    msg->handle      = 0;
-    msg->shmem_frame = virt_to_mfn(blktap_be_ring.sring);
-    
-    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-}
-
-static void blkif_ptbe_close(void)
-{
-}
-
-/* Move from CLOSED to DISCONNECTED state. */
-static void blkif_ptbe_disconnect(void)
-{
-    blkif_sring_t *sring;
-    
-    sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL);
-    SHARED_RING_INIT(sring);
-    FRONT_RING_INIT(&blktap_be_ring, sring, PAGE_SIZE);
-    blktap_be_state  = BLKIF_STATE_DISCONNECTED;
-    DPRINTK("Blkif-Passthrough-BE is now DISCONNECTED.\n");
-    blkif_ptbe_send_interface_connect();
-}
-
-static void blkif_ptbe_connect(blkif_fe_interface_status_t *status)
-{
-    int err = 0;
-    
-    blktap_be_evtchn = status->evtchn;
-
-    err = bind_evtchn_to_irqhandler(
-        blktap_be_evtchn, blkif_ptbe_int, SA_SAMPLE_RANDOM, "blkif", NULL);
-    if ( err ) {
-       WPRINTK("blkfront bind_evtchn_to_irqhandler failed (%d)\n", err);
-        return;
-    } else {
-       /* transtion to connected in case we need to do a 
-           a partion probe on a whole disk */
-        blktap_be_state = BLKIF_STATE_CONNECTED;
-    }
-}
-
-static void unexpected(blkif_fe_interface_status_t *status)
-{
-    WPRINTK(" TAP: Unexpected blkif status %s in state %s\n", 
-           blkif_status_name[status->status],
-           blkif_state_name[blktap_be_state]);
-}
-
-static void blkif_ptbe_status(
-    blkif_fe_interface_status_t *status)
-{
-    if ( status->handle != 0 )
-    {
-        DPRINTK("Status change on unsupported blkif %d\n",
-               status->handle);
-        return;
-    }
-
-    DPRINTK("ptbe_status: got %s\n", blkif_status_name[status->status]);
-    
-    switch ( status->status )
-    {
-    case BLKIF_INTERFACE_STATUS_CLOSED:
-        switch ( blktap_be_state )
-        {
-        case BLKIF_STATE_CLOSED:
-            unexpected(status);
-            break;
-        case BLKIF_STATE_DISCONNECTED:
-        case BLKIF_STATE_CONNECTED:
-            unexpected(status);
-            blkif_ptbe_close();
-            break;
-        }
-        break;
-        
-    case BLKIF_INTERFACE_STATUS_DISCONNECTED:
-        switch ( blktap_be_state )
-        {
-        case BLKIF_STATE_CLOSED:
-            blkif_ptbe_disconnect();
-            break;
-        case BLKIF_STATE_DISCONNECTED:
-        case BLKIF_STATE_CONNECTED:
-            printk(KERN_ALERT "*** add recovery code to the tap driver. ***\n");
-            unexpected(status);
-            break;
-        }
-        break;
-        
-    case BLKIF_INTERFACE_STATUS_CONNECTED:
-        switch ( blktap_be_state )
-        {
-        case BLKIF_STATE_CLOSED:
-            unexpected(status);
-            blkif_ptbe_disconnect();
-            blkif_ptbe_connect(status);
-            break;
-        case BLKIF_STATE_DISCONNECTED:
-            blkif_ptbe_connect(status);
-            break;
-        case BLKIF_STATE_CONNECTED:
-            unexpected(status);
-            blkif_ptbe_connect(status);
-            break;
-        }
-        break;
-
-   case BLKIF_INTERFACE_STATUS_CHANGED:
-        switch ( blktap_be_state )
-        {
-        case BLKIF_STATE_CLOSED:
-        case BLKIF_STATE_DISCONNECTED:
-            unexpected(status);
-            break;
-        case BLKIF_STATE_CONNECTED:
-            /* vbd_update(); */
-            /* tap doesn't really get state changes... */
-            unexpected(status);
-            break;
-        }
-       break;
-       
-    default:
-        DPRINTK("Status change to unknown value %d\n", status->status);
-        break;
-    }
-}
-
-/*-----[ All control messages enter here: ]-------------------------------*/
-
-void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
-{
-    switch ( msg->type )
-    {
-    case CMSG_BLKIF_FE:
-
-        switch ( msg->subtype )
-        {
-        case CMSG_BLKIF_FE_INTERFACE_STATUS:
-            blkif_ptbe_status((blkif_fe_interface_status_t *) &msg->msg[0]);
-            break;
-
-        default:
-            goto parse_error;
-        }
-
-        break;
-
-    case CMSG_BLKIF_BE:
-        
-        /* send a copy of the message to user if wanted */
-        
-        if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
-             (blktap_mode & BLKTAP_MODE_COPY_FE) ) {
-            
-            blktap_write_ctrl_ring(msg);
-            blktap_kick_user();
-        }
-        
-        switch ( msg->subtype )
-        {
-        case CMSG_BLKIF_BE_CREATE:
-            blkif_ptfe_create((blkif_be_create_t *)&msg->msg[0]);
-            break; 
-        case CMSG_BLKIF_BE_DESTROY:
-            blkif_ptfe_destroy((blkif_be_destroy_t *)&msg->msg[0]);
-            break;        
-        case CMSG_BLKIF_BE_CONNECT:
-            blkif_ptfe_connect((blkif_be_connect_t *)&msg->msg[0]);
-            break;        
-        case CMSG_BLKIF_BE_DISCONNECT:
-            if ( !blkif_ptfe_disconnect((blkif_be_disconnect_t *)&msg->msg[0],
-                    msg->id) )
-                return;
-            break;        
-
-        /* We just ignore anything to do with vbds for now. */
-        
-        case CMSG_BLKIF_BE_VBD_CREATE:
-            DPRINTK("PT got VBD_CREATE\n");
-            ((blkif_be_vbd_create_t *)&msg->msg[0])->status 
-                = BLKIF_BE_STATUS_OKAY;
-            break;
-        case CMSG_BLKIF_BE_VBD_DESTROY:
-            DPRINTK("PT got VBD_DESTROY\n");
-            ((blkif_be_vbd_destroy_t *)&msg->msg[0])->status
-                = BLKIF_BE_STATUS_OKAY;
-            break;
-        default:
-            goto parse_error;
-        }
-
-        break;
-    }
-
-    ctrl_if_send_response(msg);
-    return;
-
- parse_error:
-    msg->length = 0;
-    ctrl_if_send_response(msg);
-}
-
-/*-----[ Initialization ]-------------------------------------------------*/
-
-void __init blkif_interface_init(void)
-{
-    blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), 
-                                     0, 0, NULL, NULL);
-    memset(blkif_hash, 0, sizeof(blkif_hash));
-    
-    blktap_be_ring.sring = NULL;
-}
-
-
-
-/* Debug : print the current ring indices. */
-
-void print_fe_ring_idxs(void)
-{
-    int i;
-    blkif_t *blkif;
-            
-    WPRINTK("FE Rings: \n---------\n");
-    for ( i = 0; i < BLKIF_HASHSZ; i++) { 
-        blkif = blkif_hash[i];
-        while (blkif != NULL) {
-            if (blkif->status == DISCONNECTED) {
-                WPRINTK("(%2d,%2d) DISCONNECTED\n", 
-                   blkif->domid, blkif->handle);
-            } else if (blkif->status == DISCONNECTING) {
-                WPRINTK("(%2d,%2d) DISCONNECTING\n", 
-                   blkif->domid, blkif->handle);
-            } else if (blkif->blk_ring.sring == NULL) {
-                WPRINTK("(%2d,%2d) CONNECTED, but null sring!\n", 
-                   blkif->domid, blkif->handle);
-            } else {
-                blkif_get(blkif);
-                WPRINTK("(%2d,%2d): req_cons: %2d, rsp_prod_prv: %2d "
-                    "| req_prod: %2d, rsp_prod: %2d\n",
-                    blkif->domid, blkif->handle,
-                    blkif->blk_ring.req_cons,
-                    blkif->blk_ring.rsp_prod_pvt,
-                    blkif->blk_ring.sring->req_prod,
-                    blkif->blk_ring.sring->rsp_prod);
-                blkif_put(blkif);
-            } 
-            blkif = blkif->hash_next;
-        }
-    }
-}        
diff --git a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_datapath.c b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_datapath.c
deleted file mode 100644 (file)
index b446fbe..0000000
+++ /dev/null
@@ -1,449 +0,0 @@
-/******************************************************************************
- * blktap_datapath.c
- * 
- * XenLinux virtual block-device tap.
- * Block request routing data path.
- * 
- * Copyright (c) 2004, Andrew Warfield
- * -- see full header in blktap.c
- */
-#include "blktap.h"
-#include <asm-xen/evtchn.h>
-
-/*-----[ The data paths ]-------------------------------------------------*/
-
-/* Connection to a single backend domain. */
-blkif_front_ring_t blktap_be_ring;
-
-/*-----[ Tracking active requests ]---------------------------------------*/
-
-/* this must be the same as MAX_PENDING_REQS in blkback.c */
-#define MAX_ACTIVE_REQS ((ACTIVE_RING_IDX)64U)
-
-active_req_t     active_reqs[MAX_ACTIVE_REQS];
-ACTIVE_RING_IDX  active_req_ring[MAX_ACTIVE_REQS];
-spinlock_t       active_req_lock = SPIN_LOCK_UNLOCKED;
-ACTIVE_RING_IDX  active_prod, active_cons;
-#define MASK_ACTIVE_IDX(_i) ((_i)&(MAX_ACTIVE_REQS-1))
-#define ACTIVE_IDX(_ar) (_ar - active_reqs)
-#define NR_ACTIVE_REQS (MAX_ACTIVE_REQS - active_prod + active_cons)
-
-inline active_req_t *get_active_req(void) 
-{
-    ACTIVE_RING_IDX idx;
-    active_req_t *ar;
-    unsigned long flags;
-        
-    ASSERT(active_cons != active_prod);   
-    
-    spin_lock_irqsave(&active_req_lock, flags);
-    idx =  active_req_ring[MASK_ACTIVE_IDX(active_cons++)];
-    ar = &active_reqs[idx];
-    spin_unlock_irqrestore(&active_req_lock, flags);
-    
-    return ar;
-}
-
-inline void free_active_req(active_req_t *ar) 
-{
-    unsigned long flags;
-        
-    spin_lock_irqsave(&active_req_lock, flags);
-    active_req_ring[MASK_ACTIVE_IDX(active_prod++)] = ACTIVE_IDX(ar);
-    spin_unlock_irqrestore(&active_req_lock, flags);
-}
-
-active_req_t *lookup_active_req(ACTIVE_RING_IDX idx)
-{
-    return &active_reqs[idx];   
-}
-
-void active_reqs_init(void)
-{
-    ACTIVE_RING_IDX i;
-    
-    active_cons = 0;
-    active_prod = MAX_ACTIVE_REQS;
-    memset(active_reqs, 0, sizeof(active_reqs));
-    for ( i = 0; i < MAX_ACTIVE_REQS; i++ )
-        active_req_ring[i] = i;
-}
-
-/* Requests passing through the tap to the backend hijack the id field
- * in the request message.  In it we put the AR index _AND_ the fe domid.
- * the domid is used by the backend to map the pages properly.
- */
-
-static inline unsigned long MAKE_ID(domid_t fe_dom, ACTIVE_RING_IDX idx)
-{
-    return ( (fe_dom << 16) | MASK_ACTIVE_IDX(idx) );
-}
-
-/*-----[ Ring helpers ]---------------------------------------------------*/
-
-static void maybe_trigger_blktap_schedule(void);
-
-inline int write_resp_to_fe_ring(blkif_t *blkif, blkif_response_t *rsp)
-{
-    blkif_response_t *resp_d;
-    active_req_t *ar;
-    
-    ar = &active_reqs[ID_TO_IDX(rsp->id)];
-    rsp->id = ar->id;
-            
-    resp_d = RING_GET_RESPONSE(&blkif->blk_ring,
-            blkif->blk_ring.rsp_prod_pvt);
-    memcpy(resp_d, rsp, sizeof(blkif_response_t));
-    wmb();
-    blkif->blk_ring.rsp_prod_pvt++;
-            
-    blkif_put(ar->blkif);
-    free_active_req(ar);
-    
-    return 0;
-}
-
-inline int write_req_to_be_ring(blkif_request_t *req)
-{
-    blkif_request_t *req_d;
-
-    if ( blktap_be_state != BLKIF_STATE_CONNECTED ) {
-        WPRINTK("Tap trying to access an unconnected backend!\n");
-        return 0;
-    }
-    
-    req_d = RING_GET_REQUEST(&blktap_be_ring,
-            blktap_be_ring.req_prod_pvt);
-    memcpy(req_d, req, sizeof(blkif_request_t));
-    wmb();
-    blktap_be_ring.req_prod_pvt++;
-            
-    return 0;
-}
-
-void kick_fe_domain(blkif_t *blkif) 
-{
-    RING_PUSH_RESPONSES(&blkif->blk_ring);
-    notify_via_evtchn(blkif->evtchn);
-    DPRINTK("notified FE(dom %u)\n", blkif->domid);
-
-    /* We just feed up a batch of request slots... */
-    maybe_trigger_blktap_schedule();
-    
-}
-
-void kick_be_domain(void)
-{
-    if ( blktap_be_state != BLKIF_STATE_CONNECTED ) 
-        return;
-    
-    wmb(); /* Ensure that the frontend can see the requests. */
-    RING_PUSH_REQUESTS(&blktap_be_ring);
-    notify_via_evtchn(blktap_be_evtchn);
-    DPRINTK("notified BE\n");
-}
-
-/*-----[ Data to/from Frontend (client) VMs ]-----------------------------*/
-
-/*-----[ Scheduler list maint -from blkback ]--- */
-
-static struct list_head blkio_schedule_list;
-static spinlock_t blkio_schedule_list_lock;
-
-static int __on_blkdev_list(blkif_t *blkif)
-{
-    return blkif->blkdev_list.next != NULL;
-}
-
-static void remove_from_blkdev_list(blkif_t *blkif)
-{
-    unsigned long flags;
-    if ( !__on_blkdev_list(blkif) ) return;
-    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
-    if ( __on_blkdev_list(blkif) )
-    {
-        list_del(&blkif->blkdev_list);
-        blkif->blkdev_list.next = NULL;
-        blkif_put(blkif);
-    }
-    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-static void add_to_blkdev_list_tail(blkif_t *blkif)
-{
-    unsigned long flags;
-    if ( __on_blkdev_list(blkif) ) return;
-    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
-    if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) )
-    {
-        list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
-        blkif_get(blkif);
-    }
-    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-
-/*-----[ Scheduler functions - from blkback ]--- */
-
-static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
-
-static int do_block_io_op(blkif_t *blkif, int max_to_do);
-
-static int blkio_schedule(void *arg)
-{
-    DECLARE_WAITQUEUE(wq, current);
-
-    blkif_t          *blkif;
-    struct list_head *ent;
-
-    daemonize(
-        "xentapd"
-        );
-
-    for ( ; ; )
-    {
-        /* Wait for work to do. */
-        add_wait_queue(&blkio_schedule_wait, &wq);
-        set_current_state(TASK_INTERRUPTIBLE);
-        if ( (NR_ACTIVE_REQS == MAX_ACTIVE_REQS) || 
-             list_empty(&blkio_schedule_list) )
-            schedule();
-        __set_current_state(TASK_RUNNING);
-        remove_wait_queue(&blkio_schedule_wait, &wq);
-
-        /* Queue up a batch of requests. */
-        while ( (NR_ACTIVE_REQS < MAX_ACTIVE_REQS) &&
-                !list_empty(&blkio_schedule_list) )
-        {
-            ent = blkio_schedule_list.next;
-            blkif = list_entry(ent, blkif_t, blkdev_list);
-            blkif_get(blkif);
-            remove_from_blkdev_list(blkif);
-            if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
-                add_to_blkdev_list_tail(blkif);
-            blkif_put(blkif);
-        }
-    }
-}
-
-static void maybe_trigger_blktap_schedule(void)
-{
-    /*
-     * Needed so that two processes, who together make the following predicate
-     * true, don't both read stale values and evaluate the predicate
-     * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
-     */
-    smp_mb();
-
-    if ( (NR_ACTIVE_REQS < (MAX_ACTIVE_REQS/2)) &&
-         !list_empty(&blkio_schedule_list) ) 
-        wake_up(&blkio_schedule_wait);
-}
-
-void blkif_deschedule(blkif_t *blkif)
-{
-    remove_from_blkdev_list(blkif);
-}
-
-void __init blkdev_schedule_init(void)
-{
-    spin_lock_init(&blkio_schedule_list_lock);
-    INIT_LIST_HEAD(&blkio_schedule_list);
-
-    if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 )
-        BUG();
-}
-    
-/*-----[ Interrupt entry from a frontend ]------ */
-
-irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs)
-{
-    blkif_t *blkif = dev_id;
-
-    add_to_blkdev_list_tail(blkif);
-    maybe_trigger_blktap_schedule();
-    return IRQ_HANDLED;
-}
-
-/*-----[ Other Frontend Ring functions ]-------- */
-
-/* irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs)*/
-static int do_block_io_op(blkif_t *blkif, int max_to_do)
-{
-    /* we have pending messages from the real frontend. */
-
-    blkif_request_t *req_s;
-    RING_IDX i, rp;
-    unsigned long flags;
-    active_req_t *ar;
-    int more_to_do = 0;
-    int notify_be = 0, notify_user = 0;
-    
-    /* lock both rings */
-    spin_lock_irqsave(&blkif_io_lock, flags);
-
-    rp = blkif->blk_ring.sring->req_prod;
-    rmb();
-    
-    for ( i = blkif->blk_ring.req_cons; 
-         (i != rp) && 
-            !RING_REQUEST_CONS_OVERFLOW(&blkif->blk_ring, i);
-          i++ )
-    {
-        
-        if ((--max_to_do == 0) || (NR_ACTIVE_REQS == MAX_ACTIVE_REQS)) 
-        {
-            more_to_do = 1;
-            break;
-        }
-        
-        req_s = RING_GET_REQUEST(&blkif->blk_ring, i);
-        /* This is a new request:  
-         * Assign an active request record, and remap the id. 
-         */
-        ar = get_active_req();
-        ar->id = req_s->id;
-        ar->nr_pages = req_s->nr_segments; 
-        blkif_get(blkif);
-        ar->blkif = blkif;
-        req_s->id = MAKE_ID(blkif->domid, ACTIVE_IDX(ar));
-        /* WPRINTK("%3u < %3lu\n", ID_TO_IDX(req_s->id), ar->id); */
-
-        /* FE -> BE interposition point is here. */
-        
-        /* ------------------------------------------------------------- */
-        /* BLKIF_OP_PROBE_HACK:                                          */
-        /* Signal to the backend that we are a tap domain.               */
-
-        if (req_s->operation == BLKIF_OP_PROBE) {
-            DPRINTK("Adding BLKTAP_COOKIE to PROBE request.\n");
-            req_s->frame_and_sects[1] = BLKTAP_COOKIE;
-        }
-
-        /* ------------------------------------------------------------- */
-
-        /* If we are in MODE_INTERCEPT_FE or MODE_COPY_FE: */
-        if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
-             (blktap_mode & BLKTAP_MODE_COPY_FE) ) {
-            
-            /* Copy the response message to UFERing */
-            /* In MODE_INTERCEPT_FE, map attached pages into the app vma */
-            /* In MODE_COPY_FE_PAGES, copy attached pages into the app vma */
-
-            DPRINTK("req->UFERing\n"); 
-            blktap_write_fe_ring(req_s);
-            notify_user = 1;
-        }
-
-        /* If we are not in MODE_INTERCEPT_FE or MODE_INTERCEPT_BE: */
-        if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
-               (blktap_mode & BLKTAP_MODE_INTERCEPT_BE)) ) {
-            
-            /* be included to prevent noise from the fe when its off */
-            /* copy the request message to the BERing */
-
-            DPRINTK("blktap: FERing[%u] -> BERing[%u]\n", 
-                    (unsigned)i & (RING_SIZE(&blktap_be_ring)-1),
-                    (unsigned)blktap_be_ring.req_prod_pvt & 
-                    (RING_SIZE((&blktap_be_ring)-1)));
-            
-            write_req_to_be_ring(req_s);
-            notify_be = 1;
-        }
-    }
-
-    blkif->blk_ring.req_cons = i;
-    
-    /* unlock rings */
-    spin_unlock_irqrestore(&blkif_io_lock, flags);
-    
-    if (notify_user)
-        blktap_kick_user();
-    if (notify_be)
-        kick_be_domain();
-    
-    return more_to_do;
-}
-
-/*-----[ Data to/from Backend (server) VM ]------------------------------*/
-
-
-irqreturn_t blkif_ptbe_int(int irq, void *dev_id, 
-                                  struct pt_regs *ptregs)
-{
-    blkif_response_t  *resp_s;
-    blkif_t *blkif;
-    RING_IDX rp, i;
-    unsigned long flags;
-
-    DPRINTK("PT got BE interrupt.\n");
-
-    /* lock both rings */
-    spin_lock_irqsave(&blkif_io_lock, flags);
-    
-    rp = blktap_be_ring.sring->rsp_prod;
-    rmb();
-      
-    for ( i = blktap_be_ring.rsp_cons; i != rp; i++)
-    {
-        resp_s = RING_GET_RESPONSE(&blktap_be_ring, i);
-        
-        /* BE -> FE interposition point is here. */
-    
-        blkif = active_reqs[ID_TO_IDX(resp_s->id)].blkif;
-        
-        /* If we are in MODE_INTERCEPT_BE or MODE_COPY_BE: */
-        if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
-             (blktap_mode & BLKTAP_MODE_COPY_BE) ) {
-
-            /* Copy the response message to UBERing */
-            /* In MODE_INTERCEPT_BE, map attached pages into the app vma */
-            /* In MODE_COPY_BE_PAGES, copy attached pages into the app vma */
-
-            DPRINTK("rsp->UBERing\n"); 
-            blktap_write_be_ring(resp_s);
-            blktap_kick_user();
-
-        }
-       
-        /* If we are NOT in MODE_INTERCEPT_BE or MODE_INTERCEPT_FE: */
-        if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
-               (blktap_mode & BLKTAP_MODE_INTERCEPT_FE)) ) {
-            
-            /* (fe included to prevent random interference from the BE) */
-            /* Copy the response message to FERing */
-         
-            DPRINTK("blktap: BERing[%u] -> FERing[%u]\n", 
-                    (unsigned)i & (RING_SIZE(&blkif->blk_ring)-1),
-                    (unsigned)blkif->blk_ring.rsp_prod_pvt & 
-                    (RING_SIZE((&blkif->blk_ring)-1)));
-
-            write_resp_to_fe_ring(blkif, resp_s);
-            kick_fe_domain(blkif);
-
-        }
-    }
-    
-    blktap_be_ring.rsp_cons = i;
-    
-
-    spin_unlock_irqrestore(&blkif_io_lock, flags);
-    
-    return IRQ_HANDLED;
-}
-
-/* Debug : print the current ring indices. */
-
-void print_be_ring_idxs(void)
-{
-    if (blktap_be_ring.sring != NULL) {
-        WPRINTK("BE Ring: \n--------\n");
-        WPRINTK("BE: rsp_cons: %2d, req_prod_prv: %2d "
-            "| req_prod: %2d, rsp_prod: %2d\n",
-            blktap_be_ring.rsp_cons,
-            blktap_be_ring.req_prod_pvt,
-            blktap_be_ring.sring->req_prod,
-            blktap_be_ring.sring->rsp_prod);
-    }
-}        
diff --git a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c
deleted file mode 100644 (file)
index 4c06b3e..0000000
+++ /dev/null
@@ -1,801 +0,0 @@
-/******************************************************************************
- * blktap_userdev.c
- * 
- * XenLinux virtual block-device tap.
- * Control interface between the driver and a character device.
- * 
- * Copyright (c) 2004, Andrew Warfield
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/miscdevice.h>
-#include <linux/errno.h>
-#include <linux/major.h>
-#include <linux/gfp.h>
-#include <linux/poll.h>
-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-#include <asm-xen/xen-public/io/blkif.h> /* for control ring. */
-#ifdef CONFIG_XEN_BLKDEV_GRANT
-#include <asm-xen/xen-public/grant_table.h>
-#endif
-
-#include "blktap.h"
-
-
-unsigned long blktap_mode = BLKTAP_MODE_PASSTHROUGH;
-
-/* Only one process may open /dev/xen/blktap at any time. */
-static unsigned long blktap_dev_inuse;
-unsigned long blktap_ring_ok; /* make this ring->state */
-
-/* for poll: */
-static wait_queue_head_t blktap_wait;
-
-/* Rings up to user space. */
-static blkif_front_ring_t blktap_ufe_ring;
-static blkif_back_ring_t  blktap_ube_ring;
-static ctrl_front_ring_t  blktap_uctrl_ring;
-
-/* local prototypes */
-static int blktap_read_fe_ring(void);
-static int blktap_read_be_ring(void);
-
-
-/* -------[ mmap region ]--------------------------------------------- */
-/*
- * We use a big chunk of address space to map in-flight requests into,
- * and export this region up to user-space.  See the comments in blkback
- * about this -- the two must be kept in sync if the tap is used as a 
- * passthrough.
- */
-
-#define MAX_PENDING_REQS 64
-
-/* immediately before the mmap area, we have a bunch of pages reserved
- * for shared memory rings.
- */
-#define RING_PAGES 3 /* Ctrl, Front, and Back */ 
-
-/* Where things are inside the device mapping. */
-struct vm_area_struct *blktap_vma = NULL;
-unsigned long mmap_vstart;  /* Kernel pages for mapping in data. */
-unsigned long rings_vstart; /* start of mmaped vma               */
-unsigned long user_vstart;  /* start of user mappings            */
-
-#define MMAP_PAGES_PER_REQUEST \
-    (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
-#define MMAP_PAGES             \
-    (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
-#define MMAP_VADDR(_start, _req,_seg)                \
-    ( _start +                                       \
-     ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
-     ((_seg) * PAGE_SIZE))
-
-/* -------[ grant handles ]------------------------------------------- */
-
-#ifdef CONFIG_XEN_BLKDEV_GRANT
-/* When using grant tables to map a frame for device access then the
- * handle returned must be used to unmap the frame. This is needed to
- * drop the ref count on the frame.
- */
-struct grant_handle_pair
-{
-    u16  kernel;
-    u16  user;
-};
-static struct grant_handle_pair pending_grant_handles[MMAP_PAGES];
-#define pending_handle(_idx, _i) \
-    (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#define BLKTAP_INVALID_HANDLE(_g) \
-    (((_g->kernel) == 0xFFFF) && ((_g->user) == 0xFFFF))
-#define BLKTAP_INVALIDATE_HANDLE(_g) do {       \
-    (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \
-    } while(0)
-    
-#endif
-
-
-/* -------[ blktap vm ops ]------------------------------------------- */
-
-static struct page *blktap_nopage(struct vm_area_struct *vma,
-                                             unsigned long address,
-                                             int *type)
-{
-    /*
-     * if the page has not been mapped in by the driver then generate
-     * a SIGBUS to the domain.
-     */
-
-    force_sig(SIGBUS, current);
-
-    return 0;
-}
-
-struct vm_operations_struct blktap_vm_ops = {
-    nopage:   blktap_nopage,
-};
-
-/* -------[ blktap file ops ]----------------------------------------- */
-
-static int blktap_open(struct inode *inode, struct file *filp)
-{
-    blkif_sring_t *sring;
-    ctrl_sring_t *csring;
-    
-    if ( test_and_set_bit(0, &blktap_dev_inuse) )
-        return -EBUSY;
-    
-    /* Allocate the ctrl ring. */
-    csring = (ctrl_sring_t *)get_zeroed_page(GFP_KERNEL);
-    if (csring == NULL)
-        goto fail_nomem;
-
-    SetPageReserved(virt_to_page(csring));
-    
-    SHARED_RING_INIT(csring);
-    FRONT_RING_INIT(&blktap_uctrl_ring, csring, PAGE_SIZE);
-
-    /* Allocate the fe ring. */
-    sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
-    if (sring == NULL)
-        goto fail_free_ctrl;
-
-    SetPageReserved(virt_to_page(sring));
-    
-    SHARED_RING_INIT(sring);
-    FRONT_RING_INIT(&blktap_ufe_ring, sring, PAGE_SIZE);
-
-    /* Allocate the be ring. */
-    sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
-    if (sring == NULL)
-        goto fail_free_fe;
-
-    SetPageReserved(virt_to_page(sring));
-    
-    SHARED_RING_INIT(sring);
-    BACK_RING_INIT(&blktap_ube_ring, sring, PAGE_SIZE);
-
-    DPRINTK(KERN_ALERT "blktap open.\n");
-
-    return 0;
-    
- fail_free_ctrl:
-    free_page( (unsigned long) blktap_uctrl_ring.sring);
-
- fail_free_fe:
-    free_page( (unsigned long) blktap_ufe_ring.sring);
-
- fail_nomem:
-    return -ENOMEM;
-}
-
-static int blktap_release(struct inode *inode, struct file *filp)
-{
-    blktap_dev_inuse = 0;
-    blktap_ring_ok = 0;
-
-    DPRINTK(KERN_ALERT "blktap closed.\n");
-
-    /* Free the ring page. */
-    ClearPageReserved(virt_to_page(blktap_uctrl_ring.sring));
-    free_page((unsigned long) blktap_uctrl_ring.sring);
-
-    ClearPageReserved(virt_to_page(blktap_ufe_ring.sring));
-    free_page((unsigned long) blktap_ufe_ring.sring);
-
-    ClearPageReserved(virt_to_page(blktap_ube_ring.sring));
-    free_page((unsigned long) blktap_ube_ring.sring);
-
-    /* Clear any active mappings and free foreign map table */
-    if (blktap_vma != NULL) {
-        zap_page_range(blktap_vma, blktap_vma->vm_start, 
-                       blktap_vma->vm_end - blktap_vma->vm_start, NULL);
-        blktap_vma = NULL;
-    }
-
-    return 0;
-}
-
-/* Note on mmap:
- * We need to map pages to user space in a way that will allow the block
- * subsystem set up direct IO to them.  This couldn't be done before, because
- * there isn't really a sane way to make a user virtual address down to a 
- * physical address when the page belongs to another domain.
- *
- * My first approach was to map the page in to kernel memory, add an entry
- * for it in the physical frame list (using alloc_lomem_region as in blkback)
- * and then attempt to map that page up to user space.  This is disallowed
- * by xen though, which realizes that we don't really own the machine frame
- * underlying the physical page.
- *
- * The new approach is to provide explicit support for this in xen linux.
- * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
- * mapped from other vms.  vma->vm_private_data is set up as a mapping 
- * from pages to actual page structs.  There is a new clause in get_user_pages
- * that does the right thing for this sort of mapping.
- * 
- * blktap_mmap sets up this mapping.  Most of the real work is done in
- * blktap_write_fe_ring below.
- */
-static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
-{
-    int size;
-    struct page **map;
-    int i;
-
-    DPRINTK(KERN_ALERT "blktap mmap (%lx, %lx)\n",
-           vma->vm_start, vma->vm_end);
-
-    vma->vm_flags |= VM_RESERVED;
-    vma->vm_ops = &blktap_vm_ops;
-
-    size = vma->vm_end - vma->vm_start;
-    if ( size != ( (MMAP_PAGES + RING_PAGES) << PAGE_SHIFT ) ) {
-        printk(KERN_INFO 
-               "blktap: you _must_ map exactly %d pages!\n",
-               MMAP_PAGES + RING_PAGES);
-        return -EAGAIN;
-    }
-
-    size >>= PAGE_SHIFT;
-    DPRINTK(KERN_INFO "blktap: 2 rings + %d pages.\n", size-1);
-    
-    rings_vstart = vma->vm_start;
-    user_vstart  = rings_vstart + (RING_PAGES << PAGE_SHIFT);
-    
-    /* Map the ring pages to the start of the region and reserve it. */
-
-    /* not sure if I really need to do this... */
-    vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
-    DPRINTK("Mapping ctrl_ring page %lx.\n", __pa(blktap_uctrl_ring.sring));
-    if (remap_pfn_range(vma, vma->vm_start, 
-                         __pa(blktap_uctrl_ring.sring) >> PAGE_SHIFT, 
-                         PAGE_SIZE, vma->vm_page_prot)) 
-        goto fail;
-
-
-    DPRINTK("Mapping be_ring page %lx.\n", __pa(blktap_ube_ring.sring));
-    if (remap_pfn_range(vma, vma->vm_start + PAGE_SIZE, 
-                         __pa(blktap_ube_ring.sring) >> PAGE_SHIFT, 
-                         PAGE_SIZE, vma->vm_page_prot)) 
-        goto fail;
-
-    DPRINTK("Mapping fe_ring page %lx.\n", __pa(blktap_ufe_ring.sring));
-    if (remap_pfn_range(vma, vma->vm_start + ( 2 * PAGE_SIZE ), 
-                         __pa(blktap_ufe_ring.sring) >> PAGE_SHIFT, 
-                         PAGE_SIZE, vma->vm_page_prot)) 
-        goto fail;
-
-    /* Mark this VM as containing foreign pages, and set up mappings. */
-    map = kmalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
-                  * sizeof(struct page_struct*),
-                  GFP_KERNEL);
-    if (map == NULL) goto fail;
-
-    for (i=0; i<((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
-        map[i] = NULL;
-    
-    vma->vm_private_data = map;
-    vma->vm_flags |= VM_FOREIGN;
-
-    blktap_vma = vma;
-    blktap_ring_ok = 1;
-
-    return 0;
- fail:
-    /* Clear any active mappings. */
-    zap_page_range(vma, vma->vm_start, 
-                   vma->vm_end - vma->vm_start, NULL);
-
-    return -ENOMEM;
-}
-
-static int blktap_ioctl(struct inode *inode, struct file *filp,
-                        unsigned int cmd, unsigned long arg)
-{
-    switch(cmd) {
-    case BLKTAP_IOCTL_KICK_FE: /* There are fe messages to process. */
-        return blktap_read_fe_ring();
-
-    case BLKTAP_IOCTL_KICK_BE: /* There are be messages to process. */
-        return blktap_read_be_ring();
-
-    case BLKTAP_IOCTL_SETMODE:
-        if (BLKTAP_MODE_VALID(arg)) {
-            blktap_mode = arg;
-            /* XXX: may need to flush rings here. */
-            printk(KERN_INFO "blktap: set mode to %lx\n", arg);
-            return 0;
-        }
-    case BLKTAP_IOCTL_PRINT_IDXS:
-        {
-            print_be_ring_idxs();
-            print_fe_ring_idxs();
-            WPRINTK("User Rings: \n-----------\n");
-            WPRINTK("UF: rsp_cons: %2d, req_prod_prv: %2d "
-                            "| req_prod: %2d, rsp_prod: %2d\n",
-                            blktap_ufe_ring.rsp_cons,
-                            blktap_ufe_ring.req_prod_pvt,
-                            blktap_ufe_ring.sring->req_prod,
-                            blktap_ufe_ring.sring->rsp_prod);
-            WPRINTK("UB: req_cons: %2d, rsp_prod_prv: %2d "
-                            "| req_prod: %2d, rsp_prod: %2d\n",
-                            blktap_ube_ring.req_cons,
-                            blktap_ube_ring.rsp_prod_pvt,
-                            blktap_ube_ring.sring->req_prod,
-                            blktap_ube_ring.sring->rsp_prod);
-            
-        }
-    }
-    return -ENOIOCTLCMD;
-}
-
-static unsigned int blktap_poll(struct file *file, poll_table *wait)
-{
-        poll_wait(file, &blktap_wait, wait);
-
-        if ( RING_HAS_UNPUSHED_REQUESTS(&blktap_uctrl_ring) ||
-             RING_HAS_UNPUSHED_REQUESTS(&blktap_ufe_ring)   ||
-             RING_HAS_UNPUSHED_RESPONSES(&blktap_ube_ring) ) {
-
-            flush_tlb_all();
-
-            RING_PUSH_REQUESTS(&blktap_uctrl_ring);
-            RING_PUSH_REQUESTS(&blktap_ufe_ring);
-            RING_PUSH_RESPONSES(&blktap_ube_ring);
-            return POLLIN | POLLRDNORM;
-        }
-
-        return 0;
-}
-
-void blktap_kick_user(void)
-{
-    /* blktap_ring->req_prod = blktap_req_prod; */
-    wake_up_interruptible(&blktap_wait);
-}
-
-static struct file_operations blktap_fops = {
-    owner:    THIS_MODULE,
-    poll:     blktap_poll,
-    ioctl:    blktap_ioctl,
-    open:     blktap_open,
-    release:  blktap_release,
-    mmap:     blktap_mmap,
-};
-    
-/*-----[ Data to/from user space ]----------------------------------------*/
-
-static void fast_flush_area(int idx, int nr_pages)
-{
-#ifdef CONFIG_XEN_BLKDEV_GRANT
-    struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
-    unsigned int i, op = 0;
-    struct grant_handle_pair *handle;
-    unsigned long ptep;
-
-    for (i=0; i<nr_pages; i++)
-    {
-        handle = &pending_handle(idx, i);
-        if (!BLKTAP_INVALID_HANDLE(handle))
-        {
-
-            unmap[op].host_addr = MMAP_VADDR(mmap_vstart, idx, i);
-            unmap[op].dev_bus_addr = 0;
-            unmap[op].handle = handle->kernel;
-            op++;
-
-            if (create_lookup_pte_addr(blktap_vma->vm_mm,
-                                       MMAP_VADDR(user_vstart, idx, i), 
-                                       &ptep) !=0) {
-                DPRINTK("Couldn't get a pte addr!\n");
-                return;
-            }
-            unmap[op].host_addr    = ptep;
-            unmap[op].dev_bus_addr = 0;
-            unmap[op].handle       = handle->user;
-            op++;
-            
-            BLKTAP_INVALIDATE_HANDLE(handle);
-        }
-    }
-    if ( unlikely(HYPERVISOR_grant_table_op(
-        GNTTABOP_unmap_grant_ref, unmap, op)))
-        BUG();
-#else
-    multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-    int               i;
-
-    for ( i = 0; i < nr_pages; i++ )
-    {
-        MULTI_update_va_mapping(mcl+i, MMAP_VADDR(mmap_vstart, idx, i),
-                                __pte(0), 0);
-    }
-
-    mcl[nr_pages-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
-    if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) )
-        BUG();
-#endif
-}
-
-
-int blktap_write_fe_ring(blkif_request_t *req)
-{
-    blkif_request_t *target;
-    int i, ret = 0;
-#ifdef CONFIG_XEN_BLKDEV_GRANT
-    struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
-    int op;
-#else
-    unsigned long remap_prot;
-    multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST+1];
-    mmu_update_t mmu[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-#endif
-
-    /*
-     * This is called to pass a request from the real frontend domain's
-     * blkif ring to the character device.
-     */
-
-    if ( ! blktap_ring_ok ) {
-        DPRINTK("blktap: ufe_ring not ready for a request!\n");
-        return 0;
-    }
-
-    if ( RING_FULL(&blktap_ufe_ring) ) {
-        PRINTK("blktap: fe_ring is full, can't add.\n");
-        return 0;
-    }
-
-    flush_cache_all(); /* a noop on intel... */
-
-    target = RING_GET_REQUEST(&blktap_ufe_ring, blktap_ufe_ring.req_prod_pvt);
-    memcpy(target, req, sizeof(*req));
-
-    /* Map the foreign pages directly in to the application */
-#ifdef CONFIG_XEN_BLKDEV_GRANT
-    op = 0;
-    for (i=0; i<target->nr_segments; i++) {
-
-        unsigned long uvaddr;
-        unsigned long kvaddr;
-        unsigned long ptep;
-
-        uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i);
-        kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i);
-
-        /* Map the remote page to kernel. */
-        map[op].host_addr = kvaddr;
-        map[op].dom   = ID_TO_DOM(req->id);
-        map[op].ref   = blkif_gref_from_fas(target->frame_and_sects[i]);
-        map[op].flags = GNTMAP_host_map;
-        /* This needs a bit more thought in terms of interposition: 
-         * If we want to be able to modify pages during write using 
-         * grant table mappings, the guest will either need to allow 
-         * it, or we'll need to incur a copy. */
-        if (req->operation == BLKIF_OP_WRITE)
-            map[op].flags |= GNTMAP_readonly;
-        op++;
-
-        /* Now map it to user. */
-        ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep);
-        if (ret)
-        {
-            DPRINTK("Couldn't get a pte addr!\n");
-            goto fail;
-        }
-
-        map[op].host_addr = ptep;
-        map[op].dom       = ID_TO_DOM(req->id);
-        map[op].ref       = blkif_gref_from_fas(target->frame_and_sects[i]);
-        map[op].flags     = GNTMAP_host_map | GNTMAP_application_map
-                            | GNTMAP_contains_pte;
-        /* Above interposition comment applies here as well. */
-        if (req->operation == BLKIF_OP_WRITE)
-            map[op].flags |= GNTMAP_readonly;
-        op++;
-    }
-
-    if ( unlikely(HYPERVISOR_grant_table_op(
-            GNTTABOP_map_grant_ref, map, op)))
-        BUG();
-
-    op = 0;
-    for (i=0; i<(target->nr_segments*2); i+=2) {
-        unsigned long uvaddr;
-        unsigned long kvaddr;
-        unsigned long offset;
-        int cancel = 0;
-
-        uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i/2);
-        kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i/2);
-
-        if ( unlikely(map[i].handle < 0) ) {
-            DPRINTK("Error on kernel grant mapping (%d)\n", map[i].handle);
-            ret = map[i].handle;
-            cancel = 1;
-        }
-
-        if ( unlikely(map[i+1].handle < 0) ) {
-            DPRINTK("Error on user grant mapping (%d)\n", map[i+1].handle);
-            ret = map[i+1].handle;
-            cancel = 1;
-        }
-
-        if (cancel) 
-            goto fail;
-
-        /* Set the necessary mappings in p2m and in the VM_FOREIGN 
-         * vm_area_struct to allow user vaddr -> struct page lookups
-         * to work.  This is needed for direct IO to foreign pages. */
-        phys_to_machine_mapping[__pa(kvaddr) >> PAGE_SHIFT] =
-            FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT);
-
-        offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
-        ((struct page **)blktap_vma->vm_private_data)[offset] =
-            pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
-
-        /* Save handles for unmapping later. */
-        pending_handle(ID_TO_IDX(req->id), i/2).kernel = map[i].handle;
-        pending_handle(ID_TO_IDX(req->id), i/2).user   = map[i+1].handle;
-    }
-    
-#else
-
-    remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
-
-    for (i=0; i<target->nr_segments; i++) {
-        unsigned long buf;
-        unsigned long uvaddr;
-        unsigned long kvaddr;
-        unsigned long offset;
-        unsigned long ptep;
-
-        buf   = target->frame_and_sects[i] & PAGE_MASK;
-        uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i);
-        kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i);
-
-        MULTI_update_va_mapping_otherdomain(
-            mcl+i, 
-            kvaddr, 
-            pfn_pte_ma(buf >> PAGE_SHIFT, __pgprot(remap_prot)),
-            0,
-            ID_TO_DOM(req->id));
-
-        phys_to_machine_mapping[__pa(kvaddr)>>PAGE_SHIFT] =
-            FOREIGN_FRAME(buf >> PAGE_SHIFT);
-
-        ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep);
-        if (ret)
-        { 
-            DPRINTK("error getting pte\n");
-            goto fail;
-        }
-
-        mmu[i].ptr = ptep;
-        mmu[i].val = (target->frame_and_sects[i] & PAGE_MASK)
-            | pgprot_val(blktap_vma->vm_page_prot);
-
-        offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
-        ((struct page **)blktap_vma->vm_private_data)[offset] =
-            pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
-    }
-    
-    /* Add the mmu_update call. */
-    mcl[i].op = __HYPERVISOR_mmu_update;
-    mcl[i].args[0] = (unsigned long)mmu;
-    mcl[i].args[1] = target->nr_segments;
-    mcl[i].args[2] = 0;
-    mcl[i].args[3] = ID_TO_DOM(req->id);
-
-    BUG_ON(HYPERVISOR_multicall(mcl, target->nr_segments+1) != 0);
-
-    /* Make sure it all worked. */
-    for ( i = 0; i < target->nr_segments; i++ )
-    {
-        if ( unlikely(mcl[i].result != 0) )
-        {
-            DPRINTK("invalid buffer -- could not remap it\n");
-            ret = mcl[i].result;
-            goto fail;
-        }
-    }
-    if ( unlikely(mcl[i].result != 0) )
-    {
-        DPRINTK("direct remapping of pages to /dev/blktap failed.\n");
-        ret = mcl[i].result;
-        goto fail;
-    }
-#endif /* CONFIG_XEN_BLKDEV_GRANT */
-
-    /* Mark mapped pages as reserved: */
-    for ( i = 0; i < target->nr_segments; i++ )
-    {
-        unsigned long kvaddr;
-
-        kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i);
-        SetPageReserved(pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT));
-    }
-
-
-    blktap_ufe_ring.req_prod_pvt++;
-    
-    return 0;
-
- fail:
-    fast_flush_area(ID_TO_IDX(req->id), target->nr_segments);
-    return ret;
-}
-
-int blktap_write_be_ring(blkif_response_t *rsp)
-{
-    blkif_response_t *target;
-
-    /*
-     * This is called to pass a request from the real backend domain's
-     * blkif ring to the character device.
-     */
-
-    if ( ! blktap_ring_ok ) {
-        DPRINTK("blktap: be_ring not ready for a request!\n");
-        return 0;
-    }
-
-    /* No test for fullness in the response direction. */
-
-    target = RING_GET_RESPONSE(&blktap_ube_ring,
-            blktap_ube_ring.rsp_prod_pvt);
-    memcpy(target, rsp, sizeof(*rsp));
-
-    /* no mapping -- pages were mapped in blktap_write_fe_ring() */
-
-    blktap_ube_ring.rsp_prod_pvt++;
-    
-    return 0;
-}
-
-static int blktap_read_fe_ring(void)
-{
-    /* This is called to read responses from the UFE ring. */
-
-    RING_IDX i, j, rp;
-    blkif_response_t *resp_s;
-    blkif_t *blkif;
-    active_req_t *ar;
-
-    DPRINTK("blktap_read_fe_ring()\n");
-
-    /* if we are forwarding from UFERring to FERing */
-    if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) {
-
-        /* for each outstanding message on the UFEring  */
-        rp = blktap_ufe_ring.sring->rsp_prod;
-        rmb();
-        
-        for ( i = blktap_ufe_ring.rsp_cons; i != rp; i++ )
-        {
-            resp_s = RING_GET_RESPONSE(&blktap_ufe_ring, i);
-            
-            DPRINTK("resp->fe_ring\n");
-            ar = lookup_active_req(ID_TO_IDX(resp_s->id));
-            blkif = ar->blkif;
-            for (j = 0; j < ar->nr_pages; j++) {
-                unsigned long vaddr;
-                struct page **map = blktap_vma->vm_private_data;
-                int offset; 
-
-                vaddr  = MMAP_VADDR(user_vstart, ID_TO_IDX(resp_s->id), j);
-                offset = (vaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
-
-                ClearPageReserved(virt_to_page(vaddr));
-                map[offset] = NULL;
-            }
-
-            fast_flush_area(ID_TO_IDX(resp_s->id), ar->nr_pages);
-            zap_page_range(blktap_vma, 
-                    MMAP_VADDR(user_vstart, ID_TO_IDX(resp_s->id), 0), 
-                    ar->nr_pages << PAGE_SHIFT, NULL);
-            write_resp_to_fe_ring(blkif, resp_s);
-            blktap_ufe_ring.rsp_cons = i + 1;
-            kick_fe_domain(blkif);
-        }
-    }
-    return 0;
-}
-
-static int blktap_read_be_ring(void)
-{
-    /* This is called to read requests from the UBE ring. */
-
-    RING_IDX i, rp;
-    blkif_request_t *req_s;
-
-    DPRINTK("blktap_read_be_ring()\n");
-
-    /* if we are forwarding from UFERring to FERing */
-    if (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) {
-
-        /* for each outstanding message on the UFEring  */
-        rp = blktap_ube_ring.sring->req_prod;
-        rmb();
-        for ( i = blktap_ube_ring.req_cons; i != rp; i++ )
-        {
-            req_s = RING_GET_REQUEST(&blktap_ube_ring, i);
-
-            DPRINTK("req->be_ring\n");
-            write_req_to_be_ring(req_s);
-            kick_be_domain();
-        }
-        
-        blktap_ube_ring.req_cons = i;
-    }
-
-    return 0;
-}
-
-int blktap_write_ctrl_ring(ctrl_msg_t *msg)
-{
-    ctrl_msg_t *target;
-
-    if ( ! blktap_ring_ok ) {
-        DPRINTK("blktap: be_ring not ready for a request!\n");
-        return 0;
-    }
-
-    /* No test for fullness in the response direction. */
-
-    target = RING_GET_REQUEST(&blktap_uctrl_ring,
-            blktap_uctrl_ring.req_prod_pvt);
-    memcpy(target, msg, sizeof(*msg));
-
-    blktap_uctrl_ring.req_prod_pvt++;
-    
-    /* currently treat the ring as unidirectional. */
-    blktap_uctrl_ring.rsp_cons = blktap_uctrl_ring.sring->rsp_prod;
-    
-    return 0;
-       
-}
-
-/* -------[ blktap module setup ]------------------------------------- */
-
-static struct miscdevice blktap_miscdev = {
-    .minor        = BLKTAP_MINOR,
-    .name         = "blktap",
-    .fops         = &blktap_fops,
-    .devfs_name   = "misc/blktap",
-};
-
-int blktap_init(void)
-{
-    int err, i, j;
-    struct page *page;
-
-    page = balloon_alloc_empty_page_range(MMAP_PAGES);
-    BUG_ON(page == NULL);
-    mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
-
-#ifdef CONFIG_XEN_BLKDEV_GRANT
-    for (i=0; i<MAX_PENDING_REQS ; i++)
-        for (j=0; j<BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
-            BLKTAP_INVALIDATE_HANDLE(&pending_handle(i, j));
-#endif
-
-    err = misc_register(&blktap_miscdev);
-    if ( err != 0 )
-    {
-        printk(KERN_ALERT "Couldn't register /dev/misc/blktap (%d)\n", err);
-        return err;
-    }
-
-    init_waitqueue_head(&blktap_wait);
-
-
-    return 0;
-}
diff --git a/linux-2.6-xen-sparse/drivers/xen/blktap/common.h b/linux-2.6-xen-sparse/drivers/xen/blktap/common.h
new file mode 100644 (file)
index 0000000..3239a17
--- /dev/null
@@ -0,0 +1,112 @@
+
+#ifndef __BLKIF__BACKEND__COMMON_H__
+#define __BLKIF__BACKEND__COMMON_H__
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <asm/setup.h>
+#include <asm/pgalloc.h>
+#include <asm-xen/evtchn.h>
+#include <asm-xen/hypervisor.h>
+#include <asm-xen/xen-public/io/blkif.h>
+#include <asm-xen/xen-public/io/ring.h>
+#include <asm-xen/gnttab.h>
+
+#if 0
+#define ASSERT(_p) \
+    if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
+    __LINE__, __FILE__); *(int*)0=0; }
+#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
+                           __FILE__ , __LINE__ , ## _a )
+#else
+#define ASSERT(_p) ((void)0)
+#define DPRINTK(_f, _a...) ((void)0)
+#endif
+
+#define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args)
+
+struct vbd {
+    blkif_vdev_t   handle;      /* what the domain refers to this vbd as */
+    unsigned char  readonly;    /* Non-zero -> read-only */
+    unsigned char  type;        /* VDISK_xxx */
+    blkif_pdev_t   pdevice;     /* phys device that this vbd maps to */
+    struct block_device *bdev;
+}; 
+
+typedef struct blkif_st {
+    /* Unique identifier for this interface. */
+    domid_t           domid;
+    unsigned int      handle;
+    /* Physical parameters of the comms window. */
+    unsigned long     shmem_frame;
+    unsigned int      evtchn;
+    unsigned int      remote_evtchn;
+    /* Comms information. */
+    blkif_back_ring_t blk_ring;
+    /* VBDs attached to this interface. */
+    struct vbd        vbd;
+    /* Private fields. */
+    enum { DISCONNECTED, CONNECTED } status;
+#ifdef CONFIG_XEN_BLKDEV_TAP_BE
+    /* Is this a blktap frontend */
+    unsigned int     is_blktap;
+#endif
+    struct list_head blkdev_list;
+    spinlock_t       blk_ring_lock;
+    atomic_t         refcnt;
+
+    struct work_struct free_work;
+    u16 shmem_handle;
+    unsigned long shmem_vaddr;
+    grant_ref_t shmem_ref;
+} blkif_t;
+
+void blkif_create(blkif_be_create_t *create);
+void blkif_destroy(blkif_be_destroy_t *destroy);
+void blkif_connect(blkif_be_connect_t *connect);
+int  blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id);
+void blkif_disconnect_complete(blkif_t *blkif);
+blkif_t *alloc_blkif(domid_t domid);
+void free_blkif_callback(blkif_t *blkif);
+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
+
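+/* An interface is freed asynchronously (free_blkif_callback schedules the
+ * actual teardown on a workqueue) once its reference count drops to zero. */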
+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define blkif_put(_b)                             \
+    do {                                          \
+        if ( atomic_dec_and_test(&(_b)->refcnt) ) \
+            free_blkif_callback(_b);             \
+    } while (0)
+
+/* Create a vbd. */
+int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, blkif_pdev_t pdevice,
+              int readonly);
+void vbd_free(struct vbd *vbd);
+
+unsigned long vbd_size(struct vbd *vbd);
+unsigned int vbd_info(struct vbd *vbd);
+unsigned long vbd_secsize(struct vbd *vbd);
+
+struct phys_req {
+    unsigned short       dev;
+    unsigned short       nr_sects;
+    struct block_device *bdev;
+    blkif_sector_t       sector_number;
+};
+
+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); 
+
+void blkif_interface_init(void);
+
+void blkif_deschedule(blkif_t *blkif);
+
+void blkif_xenbus_init(void);
+
+irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+
+#endif /* __BLKIF__BACKEND__COMMON_H__ */
diff --git a/linux-2.6-xen-sparse/drivers/xen/blktap/interface.c b/linux-2.6-xen-sparse/drivers/xen/blktap/interface.c
new file mode 100644 (file)
index 0000000..6f9aae9
--- /dev/null
@@ -0,0 +1,141 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/interface.c
+ * 
+ * Block-device interface management.
+ * 
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+#include <asm-xen/evtchn.h>
+
+static kmem_cache_t *blkif_cachep;
+
+blkif_t *alloc_blkif(domid_t domid)
+{
+    blkif_t *blkif;
+
+    blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
+    if (!blkif)
+           return ERR_PTR(-ENOMEM);
+
+    memset(blkif, 0, sizeof(*blkif));
+    blkif->domid = domid;
+    blkif->status = DISCONNECTED;
+    spin_lock_init(&blkif->blk_ring_lock);
+    atomic_set(&blkif->refcnt, 1);
+
+    return blkif;
+}
+
+static int map_frontend_page(blkif_t *blkif, unsigned long localaddr,
+                            unsigned long shared_page)
+{
+    struct gnttab_map_grant_ref op;
+    op.host_addr = localaddr;
+    op.flags = GNTMAP_host_map;
+    op.ref = shared_page;
+    op.dom = blkif->domid;
+
+    BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) );
+
+    if (op.handle < 0) {
+	DPRINTK("Grant table operation failure!\n");
+       return op.handle;
+    }
+
+    blkif->shmem_ref = shared_page;
+    blkif->shmem_handle = op.handle;
+    blkif->shmem_vaddr = localaddr;
+    return 0;
+}
+
+static void unmap_frontend_page(blkif_t *blkif)
+{
+    struct gnttab_unmap_grant_ref op;
+
+    op.host_addr = blkif->shmem_vaddr;
+    op.handle = blkif->shmem_handle;
+    op.dev_bus_addr = 0;
+    BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
+}
+
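+/* Map the frontend's shared ring page (passed as a grant reference), bind
+ * the interdomain event channel, and initialise the back ring. */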
+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
+{
+    struct vm_struct *vma;
+    blkif_sring_t *sring;
+    evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain };
+    int err;
+
+    BUG_ON(blkif->remote_evtchn);
+
+    if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL )
+       return -ENOMEM;
+
+    err = map_frontend_page(blkif, (unsigned long)vma->addr, shared_page);
+    if (err) {
+        vfree(vma->addr);
+       return err;
+    }
+
+    op.u.bind_interdomain.dom1 = DOMID_SELF;
+    op.u.bind_interdomain.dom2 = blkif->domid;
+    op.u.bind_interdomain.port1 = 0;
+    op.u.bind_interdomain.port2 = evtchn;
+    err = HYPERVISOR_event_channel_op(&op);
+    if (err) {
+       unmap_frontend_page(blkif);
+       vfree(vma->addr);
+       return err;
+    }
+
+    blkif->evtchn = op.u.bind_interdomain.port1;
+    blkif->remote_evtchn = evtchn;
+
+    sring = (blkif_sring_t *)vma->addr;
+    SHARED_RING_INIT(sring);
+    BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE);
+
+    bind_evtchn_to_irqhandler(blkif->evtchn, blkif_be_int, 0, "blkif-backend",
+                             blkif);
+    blkif->status        = CONNECTED;
+    blkif->shmem_frame   = shared_page;
+
+    return 0;
+}
+
+static void free_blkif(void *arg)
+{
+    evtchn_op_t op = { .cmd = EVTCHNOP_close };
+    blkif_t *blkif = (blkif_t *)arg;
+
+    op.u.close.port = blkif->evtchn;
+    op.u.close.dom = DOMID_SELF;
+    HYPERVISOR_event_channel_op(&op);
+    op.u.close.port = blkif->remote_evtchn;
+    op.u.close.dom = blkif->domid;
+    HYPERVISOR_event_channel_op(&op);
+
+    if (blkif->evtchn)
+        unbind_evtchn_from_irqhandler(blkif->evtchn, blkif);
+
+    if (blkif->blk_ring.sring) {
+       unmap_frontend_page(blkif);
+       vfree(blkif->blk_ring.sring);
+       blkif->blk_ring.sring = NULL;
+    }
+
+    kmem_cache_free(blkif_cachep, blkif);
+}
+
+void free_blkif_callback(blkif_t *blkif)
+{
+    INIT_WORK(&blkif->free_work, free_blkif, (void *)blkif);
+    schedule_work(&blkif->free_work);
+}
+
+void __init blkif_interface_init(void)
+{
+    blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), 
+                                     0, 0, NULL, NULL);
+}
diff --git a/linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c b/linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c
new file mode 100644 (file)
index 0000000..06d07ca
--- /dev/null
@@ -0,0 +1,225 @@
+/*  Xenbus code for blkif tap
+
+    A Warfield.
+
+    Hastily modified from the original backend code:
+
+    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <asm-xen/xenbus.h>
+#include "common.h"
+
+struct backend_info
+{
+       struct xenbus_device *dev;
+
+       /* our communications channel */
+       blkif_t *blkif;
+
+       long int frontend_id;
+
+       /* watch back end for changes */
+       struct xenbus_watch backend_watch;
+
+       /* watch front end for changes */
+       struct xenbus_watch watch;
+       char *frontpath;
+};
+
+static int blkback_remove(struct xenbus_device *dev)
+{
+       struct backend_info *be = dev->data;
+
+       if (be->watch.node)
+               unregister_xenbus_watch(&be->watch);
+       unregister_xenbus_watch(&be->backend_watch);
+       if (be->blkif)
+               blkif_put(be->blkif);
+       if (be->frontpath)
+               kfree(be->frontpath);
+       kfree(be);
+       return 0;
+}
+
+/* Front end tells us its ring reference and event channel. */
+static void frontend_changed(struct xenbus_watch *watch, const char *node)
+{
+       unsigned long ring_ref;
+       unsigned int evtchn;
+       int err;
+       struct backend_info *be
+               = container_of(watch, struct backend_info, watch);
+
+       /* If other end is gone, delete ourself. */
+       if (node && !xenbus_exists(be->frontpath, "")) {
+               xenbus_rm(be->dev->nodename, "");
+               device_unregister(&be->dev->dev);
+               return;
+       }
+       if (be->blkif == NULL || be->blkif->status == CONNECTED)
+               return;
+
+       err = xenbus_gather(be->frontpath, "ring-ref", "%lu", &ring_ref,
+                           "event-channel", "%u", &evtchn, NULL);
+       if (err) {
+               xenbus_dev_error(be->dev, err,
+                                "reading %s/ring-ref and event-channel",
+                                be->frontpath);
+               return;
+       }
+
+       /* Map the shared frame, irq etc. */
+       err = blkif_map(be->blkif, ring_ref, evtchn);
+       if (err) {
+               xenbus_dev_error(be->dev, err, "mapping ring-ref %lu port %u",
+                                ring_ref, evtchn);
+               goto abort;
+       }
+
+       xenbus_dev_ok(be->dev);
+
+       return;
+
+abort:
+       xenbus_transaction_end(1);
+}
+
+/* 
+   Setup supplies physical device.  
+   We provide event channel and device details to front end.
+   Frontend supplies shared frame and event channel.
+ */
+static void backend_changed(struct xenbus_watch *watch, const char *node)
+{
+       int err;
+       char *p;
+       long int handle;
+       struct backend_info *be
+               = container_of(watch, struct backend_info, backend_watch);
+       struct xenbus_device *dev = be->dev;
+
+       if (be->blkif == NULL) {
+               /* Front end dir is a number, which is used as the handle. */
+               p = strrchr(be->frontpath, '/') + 1;
+               handle = simple_strtoul(p, NULL, 0);
+
+               be->blkif = alloc_blkif(be->frontend_id);
+               if (IS_ERR(be->blkif)) {
+                       err = PTR_ERR(be->blkif);
+                       be->blkif = NULL;
+                       xenbus_dev_error(dev, err, "creating block interface");
+                       return;
+               }
+
+               /* Pass in NULL node to skip exist test. */
+               frontend_changed(&be->watch, NULL);
+       }
+}
+
+static int blkback_probe(struct xenbus_device *dev,
+                        const struct xenbus_device_id *id)
+{
+       struct backend_info *be;
+       char *frontend;
+       int err;
+
+       be = kmalloc(sizeof(*be), GFP_KERNEL);
+       if (!be) {
+               xenbus_dev_error(dev, -ENOMEM, "allocating backend structure");
+               return -ENOMEM;
+       }
+       memset(be, 0, sizeof(*be));
+
+       frontend = NULL;
+       err = xenbus_gather(dev->nodename,
+                           "frontend-id", "%li", &be->frontend_id,
+                           "frontend", NULL, &frontend,
+                           NULL);
+       if (XENBUS_EXIST_ERR(err))
+               goto free_be;
+       if (err < 0) {
+               xenbus_dev_error(dev, err,
+                                "reading %s/frontend or frontend-id",
+                                dev->nodename);
+               goto free_be;
+       }
+       if (strlen(frontend) == 0 || !xenbus_exists(frontend, "")) {
+               /* If we can't get a frontend path and a frontend-id,
+                * then our bus-id is no longer valid and we need to
+                * destroy the backend device.
+                */
+               err = -ENOENT;
+               goto free_be;
+       }
+
+       be->dev = dev;
+       be->backend_watch.node = dev->nodename;
+       be->backend_watch.callback = backend_changed;
+       err = register_xenbus_watch(&be->backend_watch);
+       if (err) {
+               be->backend_watch.node = NULL;
+               xenbus_dev_error(dev, err, "adding backend watch on %s",
+                                dev->nodename);
+               goto free_be;
+       }
+
+       be->frontpath = frontend;
+       be->watch.node = be->frontpath;
+       be->watch.callback = frontend_changed;
+       err = register_xenbus_watch(&be->watch);
+       if (err) {
+               be->watch.node = NULL;
+               xenbus_dev_error(dev, err,
+                                "adding frontend watch on %s",
+                                be->frontpath);
+               goto free_be;
+       }
+
+       dev->data = be;
+
+       backend_changed(&be->backend_watch, dev->nodename);
+       return 0;
+
+ free_be:
+       if (be->backend_watch.node)
+               unregister_xenbus_watch(&be->backend_watch);
+       if (frontend)
+               kfree(frontend);
+       kfree(be);
+       return err;
+}
+
+static struct xenbus_device_id blkback_ids[] = {
+       { "vbd" },
+       { "" }
+};
+
+static struct xenbus_driver blkback = {
+       .name = "vbd",
+       .owner = THIS_MODULE,
+       .ids = blkback_ids,
+       .probe = blkback_probe,
+       .remove = blkback_remove,
+};
+
+void blkif_xenbus_init(void)
+{
+       xenbus_register_backend(&blkback);
+}
index e5d776921e5f3d10036a0dbcb0e5d4ef9a65076f..e3e1451ea062374b083f66d962f8671ad468b8ad 100644 (file)
@@ -954,10 +954,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                         i++;
                         start += PAGE_SIZE;
                         len--;
-printk(KERN_ALERT "HIT  0x%lx\n", start);
                         continue;
                     } 
-else printk(KERN_ALERT "MISS 0x%lx\n", start);
                 }
 
                if (!vma || (vma->vm_flags & VM_IO)
index 0fe52162124e9a2f66734187133210dbd902a3d0..fab85d1adc3ac7ac2a1ad74142e2b318d2411a5a 100644 (file)
@@ -6,7 +6,8 @@ XEN_ROOT = ../..
 include $(XEN_ROOT)/tools/Rules.mk
 
 SUBDIRS :=
-SUBDIRS += parallax
+SUBDIRS += ublkback
+#SUBDIRS += parallax
 
 BLKTAP_INSTALL_DIR = /usr/sbin
 
@@ -14,12 +15,12 @@ INSTALL            = install
 INSTALL_PROG       = $(INSTALL) -m0755
 INSTALL_DIR        = $(INSTALL) -d -m0755
 
-INCLUDES += -I. -I $(XEN_LIBXC)
+INCLUDES += -I. -I $(XEN_LIBXC) -I $(XEN_XENSTORE)
 
 LIBS     := -lpthread -lz
 
 SRCS     :=
-SRCS     += blktaplib.c
+SRCS     += blktaplib.c xenbus.c blkif.c
 
 CFLAGS   += -Wall
 CFLAGS   += -Werror
@@ -28,17 +29,20 @@ CFLAGS   += -Wno-unused
 CFLAGS   += -g3
 CFLAGS   += -fno-strict-aliasing
 CFLAGS   += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
+# get asprintf():
+CFLAGS   += -D _GNU_SOURCE
 # Get gcc to generate the dependencies for us.
 CFLAGS   += -Wp,-MD,.$(@F).d
 CFLAGS   += $(INCLUDES) 
 DEPS     = .*.d
 
 OBJS     = $(patsubst %.c,%.o,$(SRCS))
-IBINS    = blkdump
+IBINS   :=
+#IBINS   += blkdump
 
 LIB      = libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR)
 
-all: mk-symlinks libblktap.so blkdump
+all: mk-symlinks libblktap.so #blkdump
        @set -e; for subdir in $(SUBDIRS); do \
                $(MAKE) -C $$subdir $@;       \
        done
@@ -59,7 +63,7 @@ install: all
        $(INSTALL_DIR) -p $(DESTDIR)/usr/include
        $(INSTALL_PROG) $(LIB) $(DESTDIR)/usr/$(LIBDIR)
        $(INSTALL_PROG) blktaplib.h $(DESTDIR)/usr/include
-       $(INSTALL_PROG) $(IBINS) $(DESTDIR)$(BLKTAP_INSTALL_DIR)
+       #$(INSTALL_PROG) $(IBINS) $(DESTDIR)$(BLKTAP_INSTALL_DIR)
        @set -e; for subdir in $(SUBDIRS); do \
                $(MAKE) -C $$subdir $@;       \
        done
@@ -79,14 +83,16 @@ rpm: all
        mv staging/i386/*.rpm .
        rm -rf staging
 
-libblktap.so: $(OBJS)
-       $(CC) $(CFLAGS) -Wl,-soname -Wl,$(SONAME) -shared -o      \
-             libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS)
+libblktap.so: $(OBJS) 
+       $(CC) $(CFLAGS) -Wl,-soname -Wl,$(SONAME) -shared         \
+             -L$(XEN_XENSTORE) -l xenstore                       \
+             -o libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS)
        ln -sf libblktap.so.$(MAJOR).$(MINOR) libblktap.so.$(MAJOR)
        ln -sf libblktap.so.$(MAJOR) $@
 
 blkdump: libblktap.so
-       $(CC) $(CFLAGS) -o blkdump -L$(XEN_LIBXC) -L. -l blktap blkdump.c
+       $(CC) $(CFLAGS) -o blkdump -L$(XEN_LIBXC) -L. \
+             -l blktap blkdump.c
 
 .PHONY: TAGS clean install mk-symlinks rpm
 
diff --git a/tools/blktap/README.sept05 b/tools/blktap/README.sept05
new file mode 100644 (file)
index 0000000..e0b9d57
--- /dev/null
@@ -0,0 +1,33 @@
+The blktap driver has been rewritten substantially, based on the current
+blkback driver.  I've removed passthrough support, as this is broken
+by the move to grant tables and the lack of transitive grants.  A
+blktap VM is now only capable of terminating block requests in
+userspace.
+
+ublkback/ contains a _very_ initial cut at a user-level version of the block
+backend driver.  It gives a working example of how the current tap
+interfaces are used, in particular w.r.t. the vbd directories in
+xenstore.
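+
+As a very rough sketch (the hook name string and my_request_hook below are
+purely illustrative; see blktaplib.h and blkdump.c for the real entry
+points), a user-space consumer attaches per-request/response hooks and then
+enters the dispatch loop:
+
+    #include "blktaplib.h"
+
+    static int my_request_hook(blkif_request_t *req)
+    {
+        /* Inspect or service the request here. */
+        return BLKTAP_PASS;    /* pass it on unmodified */
+    }
+
+    int main(void)
+    {
+        blktap_register_request_hook("my_request_hook", my_request_hook);
+        blktap_listen();       /* dispatch loop; does not normally return */
+        return 0;
+    }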
+
+parallax/ contains fairly recent parallax code.  This does not run on
+the changed blktap interface, but should only take a couple of hours'
+work to get going again.
+
+All of the tricky bits are done, but there is plenty of cleaning to
+do, and the top-level functionality is not here yet.  At the moment,
+the daemon ignores the pdev requested by the tools and opens the file 
+or device specified by TMP_IMAGE_FILE_NAME in ublkback.c.
+
+TODO:
+1. Fix to allow pdev in the store to specify the device to open.
+2. Add support (to tools as well) to mount arbitrary files...
+   just write the filename to mount into the store, instead of pdev.
+3. Reexamine blkif refcounting; it is almost certainly broken at the moment
+   (see the sketch at the end of this file).
+   - creating a blkif should take a reference.
+   - each inflight request should take a reference on dequeue in blktaplib.
+   - sending responses should drop refs.
+   - a blkif should be implicitly freed when its refcount falls to 0.
+4. Modify the parallax req/rsp code as per ublkback to use the new tap 
+   interfaces. 
+5. Write a front end that allows parallax and normal mounts to coexist.
+6. Allow blkback and blktap to run at the same time.
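+
+For item 3, one possible shape of the user-space refcounting (illustrative
+only: it assumes a refcnt field is added to the user-space blkif_t and a
+free_blkif() destructor that does not exist yet):
+
+    #define blkif_get(_b) ((_b)->refcnt++)
+    #define blkif_put(_b)                   \
+        do {                                \
+            if (--(_b)->refcnt == 0)        \
+                free_blkif(_b);             \
+        } while (0)
+
+alloc_blkif() would return with the count at 1, blktaplib would blkif_get()
+each request as it is dequeued, and writing the response would blkif_put().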
index 0cf087ff0237c057ae14db4db55c5e49d897423b..3ba8d1add75842d490bb38b22973d61bc3368206 100644 (file)
@@ -8,85 +8,18 @@
 #include <stdio.h>
 #include "blktaplib.h"
  
-int control_print(control_msg_t *msg)
-{
-    if (msg->type != CMSG_BLKIF_BE) 
-    {
-        printf("***\nUNEXPECTED CTRL MSG MAJOR TYPE(%d)\n***\n", msg->type);
-        return 0;
-    }
-    
-    switch(msg->subtype)
-    {
-    case CMSG_BLKIF_BE_CREATE:
-        if ( msg->length != sizeof(blkif_be_create_t) )
-            goto parse_error;
-        printf("[CONTROL_MSG] CMSG_BLKIF_BE_CREATE(d:%d,h:%d)\n",
-                ((blkif_be_create_t *)msg->msg)->domid,
-                ((blkif_be_create_t *)msg->msg)->blkif_handle);
-        break; 
-    case CMSG_BLKIF_BE_DESTROY:
-        if ( msg->length != sizeof(blkif_be_destroy_t) )
-            goto parse_error;
-        printf("[CONTROL_MSG] CMSG_BLKIF_BE_DESTROY(d:%d,h:%d)\n",
-                ((blkif_be_destroy_t *)msg->msg)->domid,
-                ((blkif_be_destroy_t *)msg->msg)->blkif_handle);
-        break;   
-    case CMSG_BLKIF_BE_CONNECT:
-        if ( msg->length != sizeof(blkif_be_connect_t) )
-            goto parse_error;
-        printf("[CONTROL_MSG] CMSG_BLKIF_BE_CONNECT(d:%d,h:%d)\n",
-                ((blkif_be_connect_t *)msg->msg)->domid,
-                ((blkif_be_connect_t *)msg->msg)->blkif_handle);
-        break;        
-    case CMSG_BLKIF_BE_DISCONNECT:
-        if ( msg->length != sizeof(blkif_be_disconnect_t) )
-            goto parse_error;
-        printf("[CONTROL_MSG] CMSG_BLKIF_BE_DISCONNECT(d:%d,h:%d)\n",
-                ((blkif_be_disconnect_t *)msg->msg)->domid,
-                ((blkif_be_disconnect_t *)msg->msg)->blkif_handle);
-        break;     
-    case CMSG_BLKIF_BE_VBD_CREATE:
-        if ( msg->length != sizeof(blkif_be_vbd_create_t) )
-            goto parse_error;
-        printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_CREATE(d:%d,h:%d,v:%d)\n",
-                ((blkif_be_vbd_create_t *)msg->msg)->domid,
-                ((blkif_be_vbd_create_t *)msg->msg)->blkif_handle,
-                ((blkif_be_vbd_create_t *)msg->msg)->vdevice);
-        break;
-    case CMSG_BLKIF_BE_VBD_DESTROY:
-        if ( msg->length != sizeof(blkif_be_vbd_destroy_t) )
-            goto parse_error;
-        printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_DESTROY(d:%d,h:%d,v:%d)\n",
-                ((blkif_be_vbd_destroy_t *)msg->msg)->domid,
-                ((blkif_be_vbd_destroy_t *)msg->msg)->blkif_handle,
-                ((blkif_be_vbd_destroy_t *)msg->msg)->vdevice);
-        break;
-    default:
-        goto parse_error;
-    }
-   
-    return 0; 
-      
-parse_error:
-    printf("[CONTROL_MSG] Bad message type or length!\n");
-    return 0;
-}
 int request_print(blkif_request_t *req)
 {
     int i;
     unsigned long fas;
     
-    if ( req->operation == BLKIF_OP_PROBE ) {
-        printf("[%2u:%2u<%s]\n", ID_TO_DOM(req->id), ID_TO_IDX(req->id),
-                blkif_op_name[req->operation]);
-        return BLKTAP_PASS;
-    } else {
+    if ( (req->operation == BLKIF_OP_READ) ||
+         (req->operation == BLKIF_OP_WRITE) )
+    {
         printf("[%2u:%2u<%5s] (nr_segs: %03u, dev: %03u, %010llu)\n", 
                 ID_TO_DOM(req->id), ID_TO_IDX(req->id), 
                 blkif_op_name[req->operation], 
-                req->nr_segments, req->device, 
+                req->nr_segments, req->handle, 
                 req->sector_number);
         
         
@@ -99,6 +32,8 @@ int request_print(blkif_request_t *req)
                     );
         }
             
+    } else {
+        printf("Unknown request message type.\n");
     }
     
     return BLKTAP_PASS;
@@ -106,23 +41,22 @@ int request_print(blkif_request_t *req)
 
 int response_print(blkif_response_t *rsp)
 {   
-    if ( rsp->operation == BLKIF_OP_PROBE ) {
-        printf("[%2u:%2u>%s]\n", ID_TO_DOM(rsp->id), ID_TO_IDX(rsp->id),
-                blkif_op_name[rsp->operation]);
-        return BLKTAP_PASS;
-    } else {
+    if ( (rsp->operation == BLKIF_OP_READ) ||
+         (rsp->operation == BLKIF_OP_WRITE) )
+    {
         printf("[%2u:%2u>%5s] (status: %d)\n", 
                 ID_TO_DOM(rsp->id), ID_TO_IDX(rsp->id), 
                 blkif_op_name[rsp->operation], 
                 rsp->status);
             
+    } else {
+        printf("Unknown request message type.\n");
     }
     return BLKTAP_PASS;
 }
 
 int main(int argc, char *argv[])
 {
-    blktap_register_ctrl_hook("control_print", control_print);
     blktap_register_request_hook("request_print", request_print);
     blktap_register_response_hook("response_print", response_print);
     blktap_listen();
diff --git a/tools/blktap/blkif.c b/tools/blktap/blkif.c
new file mode 100644 (file)
index 0000000..f4f386c
--- /dev/null
@@ -0,0 +1,213 @@
+/*
+ * blkif.c
+ * 
+ * The blkif interface for blktap.  A blkif describes an in-use virtual disk.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <err.h>
+
+#include "blktaplib.h"
+
+#if 1
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+#define BLKIF_HASHSZ 1024
+#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
+
+static blkif_t      *blkif_hash[BLKIF_HASHSZ];
+
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
+{
+    blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)];
+    while ( (blkif != NULL) && 
+            ((blkif->domid != domid) || (blkif->handle != handle)) )
+        blkif = blkif->hash_next;
+    return blkif;
+}
+
+blkif_t *alloc_blkif(domid_t domid)
+{
+    blkif_t *blkif;
+
+    blkif = (blkif_t *)malloc(sizeof(blkif_t));
+    if (!blkif)
+        return NULL;
+
+    memset(blkif, 0, sizeof(*blkif));
+    blkif->domid = domid;
+
+    return blkif;
+}
+
+static int (*new_blkif_hook)(blkif_t *blkif) = NULL;
+void register_new_blkif_hook(int (*fn)(blkif_t *blkif))
+{
+    new_blkif_hook = fn;
+}
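+
+/*
+ * Typical usage of the hook (a sketch only -- see ublkback/ublkback.c and
+ * ublkback/ublkbacklib.c for a real example).  The hook is expected to set
+ * blkif->ops and register any per-blkif request/response hooks it needs:
+ *
+ *     int my_new_blkif(blkif_t *blkif)
+ *     {
+ *         blkif->ops = &my_ops;
+ *         blkif_register_request_hook(blkif, "my requests", my_request);
+ *         return 0;
+ *     }
+ *
+ *     register_new_blkif_hook(my_new_blkif);
+ */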
+
+int blkif_init(blkif_t *blkif, long int handle, long int pdev, 
+               long int readonly)
+{
+    domid_t domid;
+    blkif_t **pblkif;
+    
+    if (blkif == NULL)
+        return -EINVAL;
+
+    domid = blkif->domid;
+    blkif->handle   = handle;
+    blkif->pdev     = pdev;
+    blkif->readonly = readonly;
+
+    /*
+     * Call out to the new_blkif_hook. The tap application should define this,
+     * and it should return having set blkif->ops
+     * 
+     */
+    if (new_blkif_hook == NULL)
+    {
+        warn("Probe detected a new blkif, but no new_blkif_hook!");
+        return -1;
+    }
+    new_blkif_hook(blkif);
+
+    /* Now wire it in. */
+    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
+    while ( *pblkif != NULL )
+    {
+        if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) )
+        {
+            DPRINTF("Could not create blkif: already exists\n");
+            return -1;
+        }
+        pblkif = &(*pblkif)->hash_next;
+    }
+    blkif->hash_next = NULL;
+    *pblkif = blkif;
+
+    return 0;
+}
+
+void free_blkif(blkif_t *blkif)
+{
+    blkif_t **pblkif, *curs;
+    
+    pblkif = &blkif_hash[BLKIF_HASH(blkif->domid, blkif->handle)];
+    while ( (curs = *pblkif) != NULL )
+    {
+        if ( blkif == curs )
+        {
+            *pblkif = curs->hash_next;
+            break;
+        }
+        pblkif = &curs->hash_next;
+    }
+    free(blkif);
+}
+
+void blkif_register_request_hook(blkif_t *blkif, char *name, 
+                                 int (*rh)(blkif_t *, blkif_request_t *, int)) 
+{
+    request_hook_t *rh_ent, **c;
+    
+    rh_ent = (request_hook_t *)malloc(sizeof(request_hook_t));
+    if (!rh_ent) 
+    {
+        warn("couldn't allocate a new hook");
+        return;
+    }
+    
+    rh_ent->func  = rh;
+    rh_ent->next = NULL;
+    if (asprintf(&rh_ent->name, "%s", name) == -1)
+    {
+        free(rh_ent);
+        warn("couldn't allocate a new hook name");
+        return;
+    }
+    
+    c = &blkif->request_hook_chain;
+    while (*c != NULL) {
+        c = &(*c)->next;
+    }
+    *c = rh_ent;
+}
+
+void blkif_register_response_hook(blkif_t *blkif, char *name, 
+                                  int (*rh)(blkif_t *, blkif_response_t *, int)) 
+{
+    response_hook_t *rh_ent, **c;
+    
+    rh_ent = (response_hook_t *)malloc(sizeof(response_hook_t));
+    if (!rh_ent) 
+    { 
+        warn("couldn't allocate a new hook");
+        return;
+    }
+    
+    rh_ent->func  = rh;
+    rh_ent->next = NULL;
+    if (asprintf(&rh_ent->name, "%s", name) == -1)
+    {
+        free(rh_ent);
+        warn("couldn't allocate a new hook name");
+        return;
+    }
+    
+    c = &blkif->response_hook_chain;
+    while (*c != NULL) {
+        c = &(*c)->next;
+    }
+    *c = rh_ent;
+}
+
+void blkif_print_hooks(blkif_t *blkif)
+{
+    request_hook_t  *req_hook;
+    response_hook_t *rsp_hook;
+    
+    DPRINTF("Request Hooks:\n");
+    req_hook = blkif->request_hook_chain;
+    while (req_hook != NULL)
+    {
+        DPRINTF("  [0x%p] %s\n", req_hook->func, req_hook->name);
+        req_hook = req_hook->next;
+    }
+    
+    DPRINTF("Response Hooks:\n");
+    rsp_hook = blkif->response_hook_chain;
+    while (rsp_hook != NULL)
+    {
+        DPRINTF("  [0x%p] %s\n", rsp_hook->func, rsp_hook->name);
+        rsp_hook = rsp_hook->next;
+    }
+}
+
+
+long int vbd_size(blkif_t *blkif)
+{
+    return 1000000000;
+}
+
+long int vbd_secsize(blkif_t *blkif)
+{
+    return 512;
+}
+
+unsigned vbd_info(blkif_t *blkif)
+{
+    return 0;
+}
+
+
+void __init_blkif(void)
+{    
+    memset(blkif_hash, 0, sizeof(blkif_hash));
+}
index 6f60ca63242ea8e39f057e009443c906538af6a1..fbac562127e20a3efe956607c8730a1131ead251 100644 (file)
@@ -24,7 +24,7 @@
 #include <string.h>
 #include <unistd.h>
 #include <pthread.h>
-
+#include <xs.h>
                                                                      
 #define __COMPILING_BLKTAP_LIB
 #include "blktaplib.h"
 #else
 #define DPRINTF(_f, _a...) ((void)0)
 #endif
-#define DEBUG_RING_IDXS 1
+#define DEBUG_RING_IDXS 0
 
 #define POLLRDNORM     0x040 
 
 #define BLKTAP_IOCTL_KICK 1
 
+
 void got_sig_bus();
 void got_sig_int();
 
 /* in kernel these are opposite, but we are a consumer now. */
 blkif_back_ring_t  fe_ring; /* slightly counterintuitive ;) */
 blkif_front_ring_t be_ring; 
-ctrl_back_ring_t   ctrl_ring;
 
 unsigned long mmap_vstart = 0;
 char *blktap_mem;
 int fd = 0;
 
-#define BLKTAP_RING_PAGES       3 /* Ctrl, Back, Front */
-/*#define BLKTAP_MMAP_PAGES       ((11 + 1) * 64)*/
-#define BLKTAP_MMAP_PAGES \
-    ((BLKIF_MAX_SEGMENTS_PER_REQUEST + 1) * BLKIF_RING_SIZE)
-#define BLKTAP_MMAP_REGION_SIZE (BLKTAP_RING_PAGES + BLKTAP_MMAP_PAGES)
+#define BLKTAP_RING_PAGES       1 /* Front */
+#define BLKTAP_MMAP_REGION_SIZE (BLKTAP_RING_PAGES + MMAP_PAGES)
     
 int bad_count = 0;
 void bad(void)
@@ -79,126 +76,13 @@ inline unsigned int ID_TO_IDX(unsigned long id)
 }
 
 inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
-/*
+
 static int (*request_hook)(blkif_request_t *req) = NULL;
 static int (*response_hook)(blkif_response_t *req) = NULL;
-*/
-
-/*-----[ Request/Response hook chains.]----------------------------------*/
-
-#define HOOK_NAME_MAX 50
-        
-typedef struct ctrl_hook_st {
-    char name[HOOK_NAME_MAX];
-    int (*func)(control_msg_t *);
-    struct ctrl_hook_st *next;
-} ctrl_hook_t;
-        
-typedef struct request_hook_st {
-    char name[HOOK_NAME_MAX];
-    int (*func)(blkif_request_t *);
-    struct request_hook_st *next;
-} request_hook_t;
-
-typedef struct response_hook_st {
-    char name[HOOK_NAME_MAX];
-    int (*func)(blkif_response_t *);
-    struct response_hook_st *next;
-} response_hook_t;
-
-static ctrl_hook_t *ctrl_hook_chain = NULL;
-static request_hook_t *request_hook_chain = NULL;
-static response_hook_t *response_hook_chain = NULL;
-
-void blktap_register_ctrl_hook(char *name, int (*ch)(control_msg_t *)) 
-{
-    ctrl_hook_t *ch_ent, **c;
-    
-    ch_ent = (ctrl_hook_t *)malloc(sizeof(ctrl_hook_t));
-    if (!ch_ent) { printf("couldn't allocate a new hook\n"); exit(-1); }
-    
-    ch_ent->func  = ch;
-    ch_ent->next = NULL;
-    strncpy(ch_ent->name, name, HOOK_NAME_MAX);
-    ch_ent->name[HOOK_NAME_MAX-1] = '\0';
-    
-    c = &ctrl_hook_chain;
-    while (*c != NULL) {
-        c = &(*c)->next;
-    }
-    *c = ch_ent;
-}
-
-void blktap_register_request_hook(char *name, int (*rh)(blkif_request_t *)) 
-{
-    request_hook_t *rh_ent, **c;
-    
-    rh_ent = (request_hook_t *)malloc(sizeof(request_hook_t));
-    if (!rh_ent) { printf("couldn't allocate a new hook\n"); exit(-1); }
-    
-    rh_ent->func  = rh;
-    rh_ent->next = NULL;
-    strncpy(rh_ent->name, name, HOOK_NAME_MAX);
-    
-    c = &request_hook_chain;
-    while (*c != NULL) {
-        c = &(*c)->next;
-    }
-    *c = rh_ent;
-}
-
-void blktap_register_response_hook(char *name, int (*rh)(blkif_response_t *)) 
-{
-    response_hook_t *rh_ent, **c;
-    
-    rh_ent = (response_hook_t *)malloc(sizeof(response_hook_t));
-    if (!rh_ent) { printf("couldn't allocate a new hook\n"); exit(-1); }
-    
-    rh_ent->func  = rh;
-    rh_ent->next = NULL;
-    strncpy(rh_ent->name, name, HOOK_NAME_MAX);
-    
-    c = &response_hook_chain;
-    while (*c != NULL) {
-        c = &(*c)->next;
-    }
-    *c = rh_ent;
-}
-
-void print_hooks(void)
-{
-    request_hook_t  *req_hook;
-    response_hook_t *rsp_hook;
-    ctrl_hook_t     *ctrl_hook;
-    
-    DPRINTF("Control Hooks:\n");
-    ctrl_hook = ctrl_hook_chain;
-    while (ctrl_hook != NULL)
-    {
-        DPRINTF("  [0x%p] %s\n", ctrl_hook->func, ctrl_hook->name);
-        ctrl_hook = ctrl_hook->next;
-    }
-    
-    DPRINTF("Request Hooks:\n");
-    req_hook = request_hook_chain;
-    while (req_hook != NULL)
-    {
-        DPRINTF("  [0x%p] %s\n", req_hook->func, req_hook->name);
-        req_hook = req_hook->next;
-    }
-    
-    DPRINTF("Response Hooks:\n");
-    rsp_hook = response_hook_chain;
-    while (rsp_hook != NULL)
-    {
-        DPRINTF("  [0x%p] %s\n", rsp_hook->func, rsp_hook->name);
-        rsp_hook = rsp_hook->next;
-    }
-}
         
 /*-----[ Data to/from Backend (server) VM ]------------------------------*/
 
-
+/*
 
 inline int write_req_to_be_ring(blkif_request_t *req)
 {
@@ -214,6 +98,7 @@ inline int write_req_to_be_ring(blkif_request_t *req)
     
     return 0;
 }
+*/
 
 inline int write_rsp_to_fe_ring(blkif_response_t *rsp)
 {
@@ -230,14 +115,14 @@ inline int write_rsp_to_fe_ring(blkif_response_t *rsp)
     return 0;
 }
 
-static void apply_rsp_hooks(blkif_response_t *rsp)
+static void apply_rsp_hooks(blkif_t *blkif, blkif_response_t *rsp)
 {
     response_hook_t  *rsp_hook;
     
-    rsp_hook = response_hook_chain;
+    rsp_hook = blkif->response_hook_chain;
     while (rsp_hook != NULL)
     {
-        switch(rsp_hook->func(rsp))
+        switch(rsp_hook->func(blkif, rsp, 1))
         {
         case BLKTAP_PASS:
             break;
@@ -248,15 +133,19 @@ static void apply_rsp_hooks(blkif_response_t *rsp)
     }
 }
 
+
 static pthread_mutex_t push_mutex = PTHREAD_MUTEX_INITIALIZER;
 
-void blktap_inject_response(blkif_response_t *rsp)
+void blkif_inject_response(blkif_t *blkif, blkif_response_t *rsp)
 {
     
-    apply_rsp_hooks(rsp);
-    
+    apply_rsp_hooks(blkif, rsp);
+  
     write_rsp_to_fe_ring(rsp);
-    
+}
+
+void blktap_kick_responses(void)
+{
     pthread_mutex_lock(&push_mutex);
     
     RING_PUSH_RESPONSES(&fe_ring);
@@ -277,7 +166,7 @@ typedef struct {
     int active;
 } pollhook_t;
 
-static struct pollfd  pfd[MAX_POLLFDS+1];
+static struct pollfd  pfd[MAX_POLLFDS+2]; /* tap and store are extra */
 static pollhook_t     pollhooks[MAX_POLLFDS];
 static unsigned int   ph_freelist[MAX_POLLFDS];
 static unsigned int   ph_cons, ph_prod;
@@ -344,65 +233,65 @@ void __attribute__ ((constructor)) blktaplib_init(void)
 
 int blktap_listen(void)
 {
-    int               notify_be, notify_fe, tap_pfd;
-    
+    int notify_be, notify_fe, tap_pfd, store_pfd, xs_fd, ret;
+    struct xs_handle *h;
+    blkif_t *blkif;
+
     /* comms rings: */
     blkif_request_t  *req;
     blkif_response_t *rsp;
-    control_msg_t    *msg;
     blkif_sring_t    *sring;
-    ctrl_sring_t     *csring;
     RING_IDX          rp, i, pfd_count; 
     
     /* pending rings */
     blkif_request_t req_pending[BLKIF_RING_SIZE];
-    blkif_response_t rsp_pending[BLKIF_RING_SIZE];
+    /* blkif_response_t rsp_pending[BLKIF_RING_SIZE]; */
     
     /* handler hooks: */
     request_hook_t   *req_hook;
     response_hook_t  *rsp_hook;
-    ctrl_hook_t      *ctrl_hook;
     
     signal (SIGBUS, got_sig_bus);
     signal (SIGINT, got_sig_int);
     
-    print_hooks();
-    
+    __init_blkif();
+
     fd = open("/dev/blktap", O_RDWR);
-    if (fd == -1) {
-        printf("open failed! (%d)\n", errno);
-        goto open_failed;
-    }
+    if (fd == -1)
+        err(-1, "open failed!");
 
     blktap_mem = mmap(0, PAGE_SIZE * BLKTAP_MMAP_REGION_SIZE, 
              PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 
-    if ((int)blktap_mem == -1) {
-        printf("mmap failed! (%d)\n", errno);
-        goto mmap_failed;
-    }
+    if (blktap_mem == MAP_FAILED)
+        err(-1, "mmap failed!");
 
     /* assign the rings to the mapped memory */
-    csring = (ctrl_sring_t *)blktap_mem;
-    BACK_RING_INIT(&ctrl_ring, csring, PAGE_SIZE);
-    
+/*
     sring = (blkif_sring_t *)((unsigned long)blktap_mem + PAGE_SIZE);
     FRONT_RING_INIT(&be_ring, sring, PAGE_SIZE);
-    
-    sring = (blkif_sring_t *)((unsigned long)blktap_mem + (2 *PAGE_SIZE));
+*/  
+    sring = (blkif_sring_t *)((unsigned long)blktap_mem);
     BACK_RING_INIT(&fe_ring, sring, PAGE_SIZE);
 
     mmap_vstart = (unsigned long)blktap_mem +(BLKTAP_RING_PAGES << PAGE_SHIFT);
 
+
+    /* Set up store connection and watch. */
+    h = xs_daemon_open();
+    if (h == NULL) 
+        err(-1, "xs_daemon_open");
+    
+    ret = add_blockdevice_probe_watch(h, "Domain-0");
+    if (ret != 0)
+        err(0, "adding device probewatch");
+    
     ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE );
 
     while(1) {
         int ret;
         
         /* build the poll list */
-        
-        DPRINTF("Building poll list.\n");
-        
         pfd_count = 0;
         for ( i=0; i < MAX_POLLFDS; i++ ) {
             pollhook_t *ph = &pollhooks[i];
@@ -415,49 +304,31 @@ int blktap_listen(void)
             }
         }
 
-        tap_pfd = pfd_count;
+        tap_pfd = pfd_count++;
         pfd[tap_pfd].fd = fd;
         pfd[tap_pfd].events = POLLIN;
 
-        DPRINTF("poll() %d fds.\n", pfd_count);
+        store_pfd = pfd_count++;
+        pfd[store_pfd].fd = xs_fileno(h);
+        pfd[store_pfd].events = POLLIN;
         
-        if ( (ret = (poll(pfd, pfd_count+1, 10000)) == 0) ) {
+        if ( (ret = poll(pfd, pfd_count, 10000)) == 0 ) {
             if (DEBUG_RING_IDXS)
                 ioctl(fd, BLKTAP_IOCTL_PRINT_IDXS);
             continue;
         }
 
-        DPRINTF("poll returned %d\n", ret);
-
         for (i=0; i < MAX_POLLFDS; i++) {
             if ( (pollhooks[i].active ) && (pollhooks[i].pfd->revents ) )
                 pollhooks[i].func(pollhooks[i].pfd->fd);
         }
         
-        if (pfd[tap_pfd].revents) {
-            
-            /* empty the control ring */
-            rp = ctrl_ring.sring->req_prod;
-            rmb();
-            for (i = ctrl_ring.req_cons; i < rp; i++)
-            {
-                msg = RING_GET_REQUEST(&ctrl_ring, i);
+        if (pfd[store_pfd].revents) {
+            ret = xs_fire_next_watch(h);
+        }
 
-                ctrl_hook = ctrl_hook_chain;
-                while (ctrl_hook != NULL)
-                {
-                    DPRINTF("CTRL_HOOK: %s\n", ctrl_hook->name);
-                    /* We currently don't respond to ctrl messages. */
-                    ctrl_hook->func(msg);
-                    ctrl_hook = ctrl_hook->next;
-                }
-            }
-            /* Using this as a unidirectional ring. */
-            ctrl_ring.req_cons = ctrl_ring.rsp_prod_pvt = i;
-pthread_mutex_lock(&push_mutex);
-            RING_PUSH_RESPONSES(&ctrl_ring);
-pthread_mutex_unlock(&push_mutex);
-            
+        if (pfd[tap_pfd].revents) 
+        {    
             /* empty the fe_ring */
             notify_fe = 0;
             notify_be = RING_HAS_UNCONSUMED_REQUESTS(&fe_ring);
@@ -465,44 +336,62 @@ pthread_mutex_unlock(&push_mutex);
             rmb();
             for (i = fe_ring.req_cons; i != rp; i++)
             {
-                int done = 0; /* stop forwarding this request */
+                int done = 0; 
 
                 req = RING_GET_REQUEST(&fe_ring, i);
                 memcpy(&req_pending[ID_TO_IDX(req->id)], req, sizeof(*req));
                 req = &req_pending[ID_TO_IDX(req->id)];
 
-                DPRINTF("copying an fe request\n");
+                blkif = blkif_find_by_handle(ID_TO_DOM(req->id), req->handle);
 
-                req_hook = request_hook_chain;
-                while (req_hook != NULL)
+                if (blkif != NULL)
                 {
-                    DPRINTF("REQ_HOOK: %s\n", req_hook->name);
-                    switch(req_hook->func(req))
+                    req_hook = blkif->request_hook_chain;
+                    while (req_hook != NULL)
                     {
-                    case BLKTAP_RESPOND:
-                        apply_rsp_hooks((blkif_response_t *)req);
-                        write_rsp_to_fe_ring((blkif_response_t *)req);
-                        notify_fe = 1;
-                        done = 1;
-                        break;
-                    case BLKTAP_STOLEN:
-                        done = 1;
-                        break;
-                    case BLKTAP_PASS:
-                        break;
-                    default:
-                        printf("Unknown request hook return value!\n");
+                        switch(req_hook->func(blkif, req, ((i+1) == rp)))
+                        {
+                        case BLKTAP_RESPOND:
+                            apply_rsp_hooks(blkif, (blkif_response_t *)req);
+                            write_rsp_to_fe_ring((blkif_response_t *)req);
+                            notify_fe = 1;
+                            done = 1;
+                            break;
+                        case BLKTAP_STOLEN:
+                            done = 1;
+                            break;
+                        case BLKTAP_PASS:
+                            break;
+                        default:
+                            printf("Unknown request hook return value!\n");
+                        }
+                        if (done) break;
+                        req_hook = req_hook->next;
                     }
-                    if (done) break;
-                    req_hook = req_hook->next;
                 }
 
-                if (done == 0) write_req_to_be_ring(req);
+                if (done == 0) 
+                {
+                    /* this was:  */
+                    /* write_req_to_be_ring(req); */
+
+                    unsigned long id = req->id;
+                    unsigned short operation = req->operation;
+                    printf("Unterminated request!\n");
+                    rsp = (blkif_response_t *)req;
+                    rsp->id = id;
+                    rsp->operation = operation;
+                    rsp->status = BLKIF_RSP_ERROR;
+                    write_rsp_to_fe_ring(rsp);
+                    notify_fe = 1;
+                    done = 1;
+                }
 
             }
             fe_ring.req_cons = i;
 
             /* empty the be_ring */
+/*
             notify_fe |= RING_HAS_UNCONSUMED_RESPONSES(&be_ring);
             rp = be_ring.sring->rsp_prod;
             rmb();
@@ -519,9 +408,9 @@ pthread_mutex_unlock(&push_mutex);
                 write_rsp_to_fe_ring(rsp);
             }
             be_ring.rsp_cons = i;
-
+*/
             /* notify the domains */
-
+/*
             if (notify_be) {
                 DPRINTF("notifying be\n");
 pthread_mutex_lock(&push_mutex);
@@ -529,13 +418,13 @@ pthread_mutex_lock(&push_mutex);
                 ioctl(fd, BLKTAP_IOCTL_KICK_BE);
 pthread_mutex_unlock(&push_mutex);
             }
-
+*/
             if (notify_fe) {
                 DPRINTF("notifying fe\n");
-pthread_mutex_lock(&push_mutex);
+                pthread_mutex_lock(&push_mutex);
                 RING_PUSH_RESPONSES(&fe_ring);
                 ioctl(fd, BLKTAP_IOCTL_KICK_FE);
-pthread_mutex_unlock(&push_mutex);
+                pthread_mutex_unlock(&push_mutex);
             }
         }        
     }
index de9edfe2adc6eb07db1f17a71dd0d067492c7224..520165310f7e70f39ff25696511f3777159e9128 100644 (file)
@@ -2,6 +2,9 @@
  *
  * userland accessors to the block tap.
  *
+ * Sept 2/05 -- I'm scaling this back to only support block remappings
+ * to user space in a backend domain.  Passthrough and interposition can be
+ * re-added once transitive grants are available.
  */
  
 #ifndef __BLKTAPLIB_H__
@@ -13,6 +16,7 @@
 #include <xen/io/blkif.h>
 #include <xen/io/ring.h>
 #include <xen/io/domain_controller.h>
+#include <xs.h>
 
 /* /dev/xen/blktap resides at device number major=10, minor=202        */ 
 #define BLKTAP_MINOR 202
 
 static inline int BLKTAP_MODE_VALID(unsigned long arg)
 {
+    return (
+        ( arg == BLKTAP_MODE_PASSTHROUGH  ) ||
+        ( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
+        ( arg == BLKTAP_MODE_INTERPOSE    ) );
+/*
     return (
         ( arg == BLKTAP_MODE_PASSTHROUGH  ) ||
         ( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
@@ -55,6 +64,7 @@ static inline int BLKTAP_MODE_VALID(unsigned long arg)
         ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE ) ||
         ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH )
         );
+*/
 }
 
 /* Return values for handling messages in hooks. */
@@ -62,30 +72,89 @@ static inline int BLKTAP_MODE_VALID(unsigned long arg)
 #define BLKTAP_RESPOND  1 /* Request is now a reply.  Return it.  */
 #define BLKTAP_STOLEN   2 /* Hook has stolen request.             */
 
-#define domid_t unsigned short
+//#define domid_t unsigned short
 
 inline unsigned int ID_TO_IDX(unsigned long id);
 inline domid_t ID_TO_DOM(unsigned long id);
 
-void blktap_register_ctrl_hook(char *name, int (*ch)(control_msg_t *));
-void blktap_register_request_hook(char *name, int (*rh)(blkif_request_t *));
-void blktap_register_response_hook(char *name, int (*rh)(blkif_response_t *));
-void blktap_inject_response(blkif_response_t *);
 int  blktap_attach_poll(int fd, short events, int (*func)(int));
 void blktap_detach_poll(int fd);
 int  blktap_listen(void);
 
+struct blkif;
+
+typedef struct request_hook_st {
+    char *name;
+    int (*func)(struct blkif *, blkif_request_t *, int);
+    struct request_hook_st *next;
+} request_hook_t;
+
+typedef struct response_hook_st {
+    char *name;
+    int (*func)(struct blkif *, blkif_response_t *, int);
+    struct response_hook_st *next;
+} response_hook_t;
+
+struct blkif_ops {
+    long int (*get_size)(struct blkif *blkif);
+    long int (*get_secsize)(struct blkif *blkif);
+    unsigned (*get_info)(struct blkif *blkif);
+};
+
+typedef struct blkif {
+    domid_t domid;
+    long int handle;
+
+    long int pdev;
+    long int readonly;
+
+    enum { DISCONNECTED, CONNECTED } state;
+
+    struct blkif_ops *ops;
+    request_hook_t *request_hook_chain;
+    response_hook_t *response_hook_chain;
+
+    struct blkif *hash_next;
+
+    void *prv;  /* device-specific data */
+} blkif_t;
+
+void register_new_blkif_hook(int (*fn)(blkif_t *blkif));
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle);
+blkif_t *alloc_blkif(domid_t domid);
+int blkif_init(blkif_t *blkif, long int handle, long int pdev, 
+               long int readonly);
+void free_blkif(blkif_t *blkif);
+void __init_blkif(void);
+
+
+/* xenstore/xenbus: */
+extern int add_blockdevice_probe_watch(struct xs_handle *h, 
+                                       const char *domname);
+int xs_fire_next_watch(struct xs_handle *h);
+
+
+void blkif_print_hooks(blkif_t *blkif);
+void blkif_register_request_hook(blkif_t *blkif, char *name, 
+                             int (*rh)(blkif_t *, blkif_request_t *, int));
+void blkif_register_response_hook(blkif_t *blkif, char *name, 
+                             int (*rh)(blkif_t *, blkif_response_t *, int));
+void blkif_inject_response(blkif_t *blkif, blkif_response_t *);
+void blktap_kick_responses(void);
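+
+/*
+ * Request hooks return BLKTAP_PASS, BLKTAP_RESPOND or BLKTAP_STOLEN (see
+ * above).  A sketch of a hook that just logs requests and passes them on
+ * (illustrative names; cf. blkdump.c and ublkback/ublkbacklib.c):
+ *
+ *     int log_request(blkif_t *blkif, blkif_request_t *req, int batch_done)
+ *     {
+ *         printf("request for dom %u\n", (unsigned)blkif->domid);
+ *         return BLKTAP_PASS;
+ *     }
+ *
+ *     blkif_register_request_hook(blkif, "logger", log_request);
+ */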
+
+/* this must match the underlying driver... */
+#define MAX_PENDING_REQS 64
+
 /* Accessing attached data page mappings */
-#define MMAP_PAGES_PER_REQUEST \
-    (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
-#define MMAP_VADDR(_req,_seg)                        \
-    (mmap_vstart +                                   \
-     ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
+#define MMAP_PAGES                                              \
+    (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+#define MMAP_VADDR(_req,_seg)                                   \
+    (mmap_vstart +                                              \
+     ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +    \
      ((_seg) * PAGE_SIZE))
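+
+/*
+ * e.g. in ublkback, the data for segment i of request req starts at:
+ *     (char *)MMAP_VADDR(ID_TO_IDX(req->id), i)
+ *         + (blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT)
+ */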
 
 extern unsigned long mmap_vstart;
 
-
 /* Defines that are only used by library clients */
 
 #ifndef __COMPILING_BLKTAP_LIB
@@ -93,7 +162,6 @@ extern unsigned long mmap_vstart;
 static char *blkif_op_name[] = {
     [BLKIF_OP_READ]       = "READ",
     [BLKIF_OP_WRITE]      = "WRITE",
-    [BLKIF_OP_PROBE]      = "PROBE",
 };
 
 #endif /* __COMPILING_BLKTAP_LIB */
diff --git a/tools/blktap/list.h b/tools/blktap/list.h
new file mode 100644 (file)
index 0000000..bda5f46
--- /dev/null
@@ -0,0 +1,55 @@
+/*
+ * list.h
+ * 
+ * This is a subset of linux's list.h intended to be used in user-space.
+ * 
+ */
+
+#ifndef __LIST_H__
+#define __LIST_H__
+
+#define LIST_POISON1  ((void *) 0x00100100)
+#define LIST_POISON2  ((void *) 0x00200200)
+
+struct list_head {
+        struct list_head *next, *prev;
+};
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+#define LIST_HEAD(name) \
+        struct list_head name = LIST_HEAD_INIT(name)
+
+static inline void __list_add(struct list_head *new,
+                              struct list_head *prev,
+                              struct list_head *next)
+{
+        next->prev = new;
+        new->next = next;
+        new->prev = prev;
+        prev->next = new;
+}
+
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+        __list_add(new, head, head->next);
+}
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+        next->prev = prev;
+        prev->next = next;
+}
+static inline void list_del(struct list_head *entry)
+{
+        __list_del(entry->prev, entry->next);
+        entry->next = LIST_POISON1;
+        entry->prev = LIST_POISON2;
+}
+#define list_entry(ptr, type, member)                                   \
+        ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+#define list_for_each_entry(pos, head, member)                          \
+        for (pos = list_entry((head)->next, typeof(*pos), member);      \
+             &pos->member != (head);                                    \
+             pos = list_entry(pos->member.next, typeof(*pos), member))
+
+#endif /* __LIST_H__ */
diff --git a/tools/blktap/ublkback/Makefile b/tools/blktap/ublkback/Makefile
new file mode 100644 (file)
index 0000000..48d2bbf
--- /dev/null
@@ -0,0 +1,42 @@
+
+XEN_ROOT = ../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+INCLUDES += -I..
+
+INSTALL            = install
+INSTALL_PROG = $(INSTALL) -m0755
+IBIN         = ublkback
+INSTALL_DIR  = /usr/sbin
+
+CFLAGS   += -Wall
+CFLAGS   += -Werror
+CFLAGS   += -Wno-unused
+#CFLAGS   += -O3
+CFLAGS   += -g3
+CFLAGS   += -fno-strict-aliasing
+CFLAGS   += -I $(XEN_LIBXC)
+CFLAGS   += $(INCLUDES) -I.
+CFLAGS   += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
+# Get gcc to generate the dependencies for us.
+CFLAGS   += -Wp,-MD,.$(@F).d
+DEPS     = .*.d
+
+OBJS     = $(patsubst %.c,%.o,$(SRCS))
+
+all: $(IBIN)
+
+LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse)
+
+install:
+       $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(INSTALL_DIR)
+clean:
+       rm -rf *.o*~ $(DEPS) xen TAGS $(IBIN)
+
+ublkback: 
+       $(CC) $(CFLAGS) -o ublkback -L$(XEN_LIBXC) -L. -L..  \
+             -lblktap -laio ublkback.c ublkbacklib.c -pg
+
+.PHONY: clean install
+
+-include $(DEPS)
diff --git a/tools/blktap/ublkback/ublkback.c b/tools/blktap/ublkback/ublkback.c
new file mode 100644 (file)
index 0000000..d549798
--- /dev/null
@@ -0,0 +1,18 @@
+/* ublkback.c
+ *
+ * libaio-based userlevel backend.
+ */
+
+#include "blktaplib.h"
+#include "ublkbacklib.h"
+
+
+int main(int argc, char *argv[])
+{
+    ublkback_init();
+    
+    register_new_blkif_hook(ublkback_new_blkif);
+    blktap_listen();
+    
+    return 0;
+}
diff --git a/tools/blktap/ublkback/ublkbacklib.c b/tools/blktap/ublkback/ublkbacklib.c
new file mode 100644 (file)
index 0000000..767be7b
--- /dev/null
@@ -0,0 +1,477 @@
+/* ublkbacklib.c
+ *
+ * file/device image-backed block device -- using linux libaio.
+ * 
+ * (c) 2004 Andrew Warfield.
+ *
+ * Xend has been modified to use an amorfs:[fsid] disk tag.
+ * This will show up as device type (maj:240,min:0) = 61440.
+ *
+ * The fsid is placed in the sec_start field of the disk extent.
+ *
+ * NOTE: This doesn't work.  Grrr.
+ */
+
+#define _GNU_SOURCE
+#define __USE_LARGEFILE64
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <db.h>       
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/poll.h>
+#include <unistd.h>
+#include <errno.h>
+#include <libaio.h>
+#include <pthread.h>
+#include <time.h>
+#include <err.h>
+#include "blktaplib.h"
+
+/* XXXX:  */
+/* Current code just mounts this file/device to any requests that come in. */
+//#define TMP_IMAGE_FILE_NAME "/dev/sda1"
+#define TMP_IMAGE_FILE_NAME "fc3.image"
+
+#define MAX_REQUESTS            64 /* must be synced with the blkif drivers. */
+#define MAX_SEGMENTS_PER_REQ    11
+#define SECTOR_SHIFT             9
+#define MAX_AIO_REQS   (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ)
+
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+           
+#if 1                                                                        
+#define ASSERT(_p) \
+    if ( !(_p) ) { printf("Assertion '%s' failed, line %d, file %s", #_p , \
+    __LINE__, __FILE__); *(int*)0=0; }
+#else
+#define ASSERT(_p) ((void)0)
+#endif                                                                     
+
+/* Note on pending_reqs: I assume all reqs are queued before they start to 
+ * get filled.  so count of 0 is an unused record.
+ */
+typedef struct {
+    blkif_request_t  req;
+    blkif_t         *blkif;
+    int              count;
+} pending_req_t;
+
+static pending_req_t    pending_list[MAX_REQUESTS];
+static io_context_t  ctx;
+static struct iocb  *iocb_free[MAX_AIO_REQS];
+static int           iocb_free_count;
+
+/* ---[ Notification mechanism ]--------------------------------------- */
+
+enum { 
+    READ   = 0,
+    WRITE  = 1
+};
+
+static int aio_notify[2];
+static volatile int aio_listening = 0;
+static pthread_mutex_t notifier_sem = PTHREAD_MUTEX_INITIALIZER;
+
+static struct io_event aio_events[MAX_AIO_REQS];
+static int             aio_event_count = 0;
+
+/* this is commented out in libaio.h for some reason. */
+extern int io_queue_wait(io_context_t ctx, struct timespec *timeout);
+
+static void *notifier_thread(void *arg)
+{
+    int ret; 
+    int msg = 0x00feeb00;
+    
+    DPRINTF("Notifier thread started.\n");
+    for (;;) {
+        pthread_mutex_lock(&notifier_sem);
+        if ((ret = io_getevents(ctx, 1, MAX_AIO_REQS, aio_events, 0)) > 0) {
+            aio_event_count = ret;
+            write(aio_notify[WRITE], &msg, sizeof(msg));
+        } else {
+            printf("[io_getevents error! %d]\n", ret);
+            pthread_mutex_unlock(&notifier_sem);
+        }
+    }
+}
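+
+/*
+ * How the handshake above works (descriptive only): notifier_thread blocks
+ * in io_getevents(), stashes completions in aio_events[] and writes a token
+ * to aio_notify[WRITE].  The main poll loop (aio_notify[READ] is registered
+ * via blktap_attach_poll() in ublkback_init()) then calls
+ * ublkback_pollhook(), which consumes aio_events[], drains the pipe and
+ * unlocks notifier_sem so the thread can wait for more completions.
+ */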
+
+/* --- Talking to xenstore: ------------------------------------------- */
+
+int ublkback_request(blkif_t *blkif, blkif_request_t *req, int batch_done);
+int ublkback_response(blkif_t *blkif, blkif_response_t *rsp, int batch_done);
+
+typedef struct image {
+    /* These need to turn into an array/rbtree for multi-disk support. */
+    int  fd;
+    u64  fsid;
+    blkif_vdev_t   vdevice;
+    long int size;
+    long int secsize;
+    long int info;
+} image_t;
+
+long int ublkback_get_size(blkif_t *blkif)
+{
+    image_t *img = (image_t *)blkif->prv;
+    return img->size;
+}
+
+long int ublkback_get_secsize(blkif_t *blkif)
+{
+    image_t *img = (image_t *)blkif->prv;
+    return img->secsize;
+}
+
+unsigned ublkback_get_info(blkif_t *blkif)
+{
+    image_t *img = (image_t *)blkif->prv;
+    return img->info;
+}
+
+static struct blkif_ops ublkback_ops = {
+    get_size:    ublkback_get_size,
+    get_secsize: ublkback_get_secsize,
+    get_info:    ublkback_get_info,
+};
+
+int ublkback_new_blkif(blkif_t *blkif)
+{
+    image_t *image;
+    struct stat stat;
+    int ret;
+
+    image = (image_t *)malloc(sizeof(image_t));
+    if (image == NULL) {
+        printf("error allocating image record.\n");
+        return -ENOMEM;
+    }
+
+    /* Open it. */
+    image->fd = open(TMP_IMAGE_FILE_NAME, 
+                     O_RDWR | O_DIRECT | O_LARGEFILE);
+
+    if ((image->fd < 0) && (errno == EINVAL)) {
+        /* Maybe O_DIRECT isn't supported. */
+        warn("open() failed on '%s', trying again without O_DIRECT",
+               TMP_IMAGE_FILE_NAME);
+        image->fd = open(TMP_IMAGE_FILE_NAME, O_RDWR | O_LARGEFILE);
+    }
+
+    if (image->fd < 0) {
+        warn("Couldn't open image file!");
+        free(image);
+        return -EINVAL;
+    }
+
+    /* Size it. */
+    ret = fstat(image->fd, &stat);
+    if (ret != 0) {
+        printf("Couldn't stat image in PROBE!");
+        return -EINVAL;
+    }
+    
+    image->size = (stat.st_size >> SECTOR_SHIFT);
+
+    /* TODO: IOCTL to get size of raw device. */
+/*
+  ret = ioctl(img->fd, BLKGETSIZE, &blksize);
+  if (ret != 0) {
+  printf("Couldn't ioctl image in PROBE!\n");
+  goto err;
+  }
+*/
+    if (image->size == 0)
+        image->size =((u64) 16836057);
+    image->secsize = 512;
+    image->info = 0;
+
+    /* Register the hooks */
+    blkif_register_request_hook(blkif, "Ublkback req.", ublkback_request);
+    blkif_register_response_hook(blkif, "Ublkback resp.", ublkback_response);
+
+
+    printf(">X<Created a new blkif! pdev was %ld, but you got %s\n", 
+           blkif->pdev, TMP_IMAGE_FILE_NAME);
+
+    blkif->ops = &ublkback_ops;
+    blkif->prv = (void *)image;
+
+    return 0;
+}
+
+
+/* --- Moving the bits: ----------------------------------------------- */
+
+static int batch_count = 0;
+int ublkback_request(blkif_t *blkif, blkif_request_t *req, int batch_done)
+{
+    int fd;
+    u64 sector;
+    char *spage, *dpage;
+    int ret, i, idx;
+    blkif_response_t *rsp;
+    domid_t dom = ID_TO_DOM(req->id);
+    static struct iocb *ioq[MAX_SEGMENTS_PER_REQ*MAX_REQUESTS]; 
+    static int io_idx = 0;
+    struct iocb *io;
+    image_t *img;
+
+    img = (image_t *)blkif->prv;
+    fd = img->fd;
+
+    switch (req->operation) 
+    {
+    case BLKIF_OP_WRITE:
+    {
+        unsigned long size;
+        
+        
+        batch_count++;
+
+        idx = ID_TO_IDX(req->id);
+        ASSERT(pending_list[idx].count == 0);
+        memcpy(&pending_list[idx].req, req, sizeof(*req));
+        pending_list[idx].count = req->nr_segments;
+        pending_list[idx].blkif = blkif;
+        
+        for (i = 0; i < req->nr_segments; i++) {
+            
+            sector = req->sector_number + (8*i);
+            
+            size = blkif_last_sect (req->frame_and_sects[i]) -
+                   blkif_first_sect(req->frame_and_sects[i]) + 1;
+            
+            if (blkif_first_sect(req->frame_and_sects[i]) != 0)
+            DPRINTF("iWR: sec_nr: %10llu sec: %10llu (%1lu,%1lu) pos: %15lu\n",
+                    req->sector_number, sector, 
+                    blkif_first_sect(req->frame_and_sects[i]),
+                    blkif_last_sect (req->frame_and_sects[i]),
+                    (long)(sector << SECTOR_SHIFT));
+                        
+            spage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i);
+            spage += blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
+            
+            /*convert size and sector to byte offsets */
+            size   <<= SECTOR_SHIFT;
+            sector <<= SECTOR_SHIFT;
+            
+            io = iocb_free[--iocb_free_count];
+            io_prep_pwrite(io, fd, spage, size, sector);
+            io->data = (void *)idx;
+            //ioq[i] = io;
+            ioq[io_idx++] = io;
+        }
+
+        if (batch_done) {
+            ret = io_submit(ctx, io_idx, ioq);
+            batch_count = 0;
+            if (ret < 0)
+                printf("BADNESS: io_submit error! (%d)\n", errno);
+            io_idx = 0;
+        }
+        
+        return BLKTAP_STOLEN;
+        
+    }
+    case BLKIF_OP_READ:
+    {
+        unsigned long size;
+        
+        batch_count++;
+        idx = ID_TO_IDX(req->id);
+        ASSERT(pending_list[idx].count == 0);
+        memcpy(&pending_list[idx].req, req, sizeof(*req));
+        pending_list[idx].count = req->nr_segments;
+        pending_list[idx].blkif = blkif;
+        
+        for (i = 0; i < req->nr_segments; i++) {
+            
+            sector  = req->sector_number + (8*i);
+            
+            size = blkif_last_sect (req->frame_and_sects[i]) -
+                   blkif_first_sect(req->frame_and_sects[i]) + 1;
+            
+            dpage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i);
+            dpage += blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
+            
+            if (blkif_first_sect(req->frame_and_sects[i]) != 0)
+            DPRINTF("iRD : sec_nr: %10llu sec: %10llu (%1lu,%1lu) "
+                    "pos: %15lu dpage: %p\n", 
+                    req->sector_number, sector, 
+                    blkif_first_sect(req->frame_and_sects[i]),
+                    blkif_last_sect (req->frame_and_sects[i]),
+                    (long)(sector << SECTOR_SHIFT), dpage);
+            
+            /*convert size and sector to byte offsets */
+            size   <<= SECTOR_SHIFT;
+            sector <<= SECTOR_SHIFT;
+            
+
+            /*
+             * NB: Looks like AIO now has non-page aligned support, this path 
+             * can probably be removed...  Only really used for hunting
+             * superblocks anyway... ;)
+             */
+            if ( ((unsigned long)dpage % PAGE_SIZE) != 0 ) {
+                /* AIO to raw devices must be page aligned, so do this read
+                 * synchronously.  The OS is probably just looking for 
+                 * a superblock or something, so this won't hurt performance. 
+                 */
+                int ret;
+
+                printf("Slow path block read.\n");
+                /* Question: do in-progress aio ops modify the file cursor? */
+                ret = lseek(fd, sector, SEEK_SET);
+                if (ret == (off_t)-1)
+                    printf("lseek failed!\n");
+                ret = read(fd, dpage, size);
+                if (ret < 0)
+                    printf("read problem (%d)\n", ret);
+                printf("|\n|\n| read: %lld, %lu, %d\n|\n|\n", sector, size, ret);
+
+                /* not an async request any more... */
+                pending_list[idx].count--;
+
+                rsp = (blkif_response_t *)req;
+                rsp->id = req->id;
+                rsp->operation = BLKIF_OP_READ;
+                rsp->status = BLKIF_RSP_OKAY;
+                return BLKTAP_RESPOND;  
+                /* Doh -- need to flush aio if this is end-of-batch */
+            }
+
+            io = iocb_free[--iocb_free_count];
+            
+            io_prep_pread(io, fd, dpage, size, sector);
+            io->data = (void *)idx;
+            
+            ioq[io_idx++] = io;
+            //ioq[i] = io;
+        }
+        
+        if (batch_done) {
+            ret = io_submit(ctx, io_idx, ioq);
+            batch_count = 0;
+            if (ret < 0)
+                printf("BADNESS: io_submit error! (%d)\n", errno);
+            io_idx = 0;
+        }
+        
+        return BLKTAP_STOLEN;
+        
+    }
+    }
+    
+    printf("Unknown block operation!\n");
+err:
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = req->operation;
+    rsp->status = BLKIF_RSP_ERROR;
+    return BLKTAP_RESPOND;  
+}
+
+
+int ublkback_pollhook(int fd)
+{
+    struct io_event *ep;
+    int n, ret, idx;
+    blkif_request_t *req;
+    blkif_response_t *rsp;
+    int responses_queued = 0;
+    int pages=0;
+    
+    for (ep = aio_events; aio_event_count-- > 0; ep++) {
+        struct iocb *io = ep->obj;
+        idx = (int) ep->data;
+        
+        if ((idx > MAX_REQUESTS-1) || (pending_list[idx].count == 0)){
+            printf("invalid index returned(%u)!\n", idx);
+            break;
+        }
+        
+        if ((int)ep->res < 0) 
+            printf("***\n***aio request error! (%d,%d)\n***\n", 
+                   (int)ep->res, (int)ep->res2);
+        
+        pending_list[idx].count--;
+        iocb_free[iocb_free_count++] = io;
+        pages++;
+
+        if (pending_list[idx].count == 0) {
+            blkif_request_t tmp = pending_list[idx].req;
+            rsp = (blkif_response_t *)&pending_list[idx].req;
+            rsp->id = tmp.id;
+            rsp->operation = tmp.operation;
+            rsp->status = BLKIF_RSP_OKAY;
+            blkif_inject_response(pending_list[idx].blkif, rsp);
+            responses_queued++;
+        }
+    }
+
+    if (responses_queued) {
+        blktap_kick_responses();
+    }
+    
+    read(aio_notify[READ], &idx, sizeof(idx));
+    aio_listening = 1;
+    pthread_mutex_unlock(&notifier_sem);
+    
+    return 0;
+}
+
+/* the image library terminates the request stream. _resp is a noop. */
+int ublkback_response(blkif_t *blkif, blkif_response_t *rsp, int batch_done)
+{   
+    return BLKTAP_PASS;
+}
+
+void ublkback_init(void)
+{
+    int i, rc;
+    pthread_t p;
+    
+    for (i = 0; i < MAX_REQUESTS; i++)
+        pending_list[i].count = 0; 
+    
+    memset(&ctx, 0, sizeof(ctx));
+    rc = io_queue_init(MAX_AIO_REQS, &ctx);
+    if (rc != 0) {
+        printf("queue_init failed! (%d)\n", rc);
+        exit(0);
+    }
+    
+    for (i=0; i<MAX_AIO_REQS; i++) {
+        if (!(iocb_free[i] = (struct iocb *)malloc(sizeof(struct iocb)))) {
+            printf("error allocating iocb array\n");
+            exit(0);
+        }
+        iocb_free_count = i + 1;
+    }
+    
+    rc = pipe(aio_notify);
+    if (rc != 0) {
+        printf("pipe failed! (%d)\n", errno);
+        exit(0);
+    }
+    
+    rc = pthread_create(&p, NULL, notifier_thread, NULL);
+    if (rc != 0) {
+        printf("pthread_create failed! (%d)\n", errno);
+        exit(0);
+    }
+    
+    aio_listening = 1;
+    
+    blktap_attach_poll(aio_notify[READ], POLLIN, ublkback_pollhook);
+}
+
diff --git a/tools/blktap/ublkback/ublkbacklib.h b/tools/blktap/ublkback/ublkbacklib.h
new file mode 100644 (file)
index 0000000..f12b988
--- /dev/null
@@ -0,0 +1,16 @@
+/* ublkbacklib.h
+ *
+ * aio image-backed block device.
+ * 
+ * (c) 2004 Andrew Warfield.
+ *
+ * Xend has been modified to use an amorfs:[fsid] disk tag.
+ * This will show up as device type (maj:240,min:0) = 61440.
+ *
+ * The fsid is placed in the sec_start field of the disk extent.
+ */
+
+int  ublkback_request(blkif_t *blkif, blkif_request_t *req, int batch_done);
+int  ublkback_response(blkif_t *blkif, blkif_response_t *rsp,
+                       int batch_done); /* noop */
+int  ublkback_new_blkif(blkif_t *blkif);
+void ublkback_init(void);
diff --git a/tools/blktap/xenbus.c b/tools/blktap/xenbus.c
new file mode 100644 (file)
index 0000000..39d037a
--- /dev/null
@@ -0,0 +1,578 @@
+/*
+ * xenbus.c
+ * 
+ * xenbus interface to the blocktap.
+ * 
+ * This handles the top half of integration with block devices through the
+ * store -- the tap driver negotiates the device channel etc., while the
+ * userland tap client needs to sort out the disk parameters etc.
+ * 
+ * A. Warfield 2005 Based primarily on the blkback and xenbus driver code.  
+ * Comments there apply here...
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <err.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <xs.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <poll.h>
+#include "blktaplib.h"
+#include "list.h"
+
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* --- Xenstore / Xenbus helpers ---------------------------------------- */
+/*
+ * These should all be pulled out into the xenstore API.  I'm faulting commands
+ * in from the xenbus interface as I need them.
+ */
+
+
+/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */
+int xs_gather(struct xs_handle *xs, const char *dir, ...)
+{
+    va_list ap;
+    const char *name;
+    char *path;
+    int ret = 0;
+    
+    va_start(ap, dir);
+    while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
+        const char *fmt = va_arg(ap, char *);
+        void *result = va_arg(ap, void *);
+        char *p;
+        
+        if (asprintf(&path, "%s/%s", dir, name) == -1)
+        {
+            warn("allocation error in xs_gather!\n");
+            ret = ENOMEM;
+            break;
+        }
+        p = xs_read(xs, path, NULL);
+        free(path);
+        if (p == NULL) {
+            ret = ENOENT;
+            break;
+        }
+        if (fmt) {
+            if (sscanf(p, fmt, result) == 0)
+                ret = EINVAL;
+            free(p);
+        } else
+            *(char **)result = p;
+    }
+    va_end(ap);
+    return ret;
+}
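+
+/*
+ * Example usage (as in backend_changed()/blkback_probe() below); a NULL
+ * format string returns the raw malloc'd value rather than sscanf'ing it,
+ * and the return value is 0 on success or a positive errno value:
+ *
+ *     long int pdev;
+ *     char *frontend;
+ *     er = xs_gather(h, bepath,
+ *                    "physical-device", "%li", &pdev,
+ *                    "frontend", NULL, &frontend,
+ *                    NULL);
+ */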
+
+/* Single printf and write: returns -errno or 0. */
+int xs_printf(struct xs_handle *h, const char *dir, const char *node, 
+                  const char *fmt, ...)
+{
+        char *buf, *path;
+        va_list ap;
+        int ret;
+        va_start(ap, fmt);
+        ret = vasprintf(&buf, fmt, ap);
+        va_end(ap);
+        asprintf(&path, "%s/%s", dir, node);
+
+        if ((path == NULL) || (buf == NULL))
+            return 0;
+
+        ret = xs_write(h, path, buf, strlen(buf)+1, O_CREAT);
+
+        free(buf);
+        free(path);
+
+        return ret;
+}
+
+
+int xs_exists(struct xs_handle *h, const char *path)
+{
+    char **d;
+    int num;
+
+    d = xs_directory(h, path, &num);
+    if (d == NULL)
+        return 0;
+    free(d);
+    return 1;
+}
+
+
+
+/* This assumes that the domain name we are looking for is unique! */
+char *get_dom_uuid(struct xs_handle *h, const char *name)
+{
+    char **e, *val, *uuid = NULL;
+    int num, i, len;
+    char *path;
+
+    e = xs_directory(h, "/domain", &num);
+
+    i=0;
+    while (i < num) {
+        asprintf(&path, "/domain/%s/name", e[i]);
+        val = xs_read(h, path, &len);
+        free(path);
+        if (val == NULL) {
+            i++;
+            continue;
+        }
+        if (strcmp(val, name) == 0) {
+            /* match! */
+            asprintf(&path, "/domain/%s/uuid", e[i]);
+            uuid = xs_read(h, path, &len);
+            free(val);
+            free(path);
+            break;
+        }
+        free(val);
+        i++;
+    }
+
+    free(e);
+    return uuid;
+}
+
+static int strsep_len(const char *str, char c, unsigned int len)
+{
+    unsigned int i;
+    
+    for (i = 0; str[i]; i++)
+        if (str[i] == c) {
+            if (len == 0)
+                return i;
+            len--;
+        }
+    return (len == 0) ? i : -ERANGE;
+}
+
+
+/* xenbus watches: */     
+/* Register callback to watch this node. */
+struct xenbus_watch
+{
+        struct list_head list;
+        char *node;
+        void (*callback)(struct xs_handle *h, 
+                         struct xenbus_watch *, 
+                         const  char *node);
+};
+
+static LIST_HEAD(watches);
+
+/* A little paranoia: we don't just trust token. */
+static struct xenbus_watch *find_watch(const char *token)
+{
+    struct xenbus_watch *i, *cmp;
+    
+    cmp = (void *)strtoul(token, NULL, 16);
+    
+    list_for_each_entry(i, &watches, list)
+        if (i == cmp)
+            return i;
+    return NULL;
+}
+
+/* Register callback to watch this node. like xs_watch, return 0 on failure */
+int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch)
+{
+    /* Pointer in ascii is the token. */
+    char token[sizeof(watch) * 2 + 1];
+    int er;
+    
+    sprintf(token, "%lX", (long)watch);
+    if (find_watch(token)) 
+    {
+        warn("watch collision!");
+        return -EINVAL;
+    }
+    
+    er = xs_watch(h, watch->node, token);
+    if (er != 0) {
+        list_add(&watch->list, &watches);
+    } 
+        
+    return er;
+}
+
+int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch)
+{
+    char token[sizeof(watch) * 2 + 1];
+    int er;
+    
+    sprintf(token, "%lX", (long)watch);
+    if (!find_watch(token))
+    {
+        warn("no such watch!");
+        return -EINVAL;
+    }
+    
+    
+    er = xs_unwatch(h, watch->node, token);
+    list_del(&watch->list);
+    
+    if (er == 0)
+        warn("XENBUS Failed to release watch %s: %i",
+             watch->node, er);
+    return 0;
+}
+
+/* Re-register callbacks to all watches. */
+void reregister_xenbus_watches(struct xs_handle *h)
+{
+    struct xenbus_watch *watch;
+    char token[sizeof(watch) * 2 + 1];
+    
+    list_for_each_entry(watch, &watches, list) {
+        sprintf(token, "%lX", (long)watch);
+        xs_watch(h, watch->node, token);
+    }
+}
+
+/* based on watch_thread() */
+int xs_fire_next_watch(struct xs_handle *h)
+{
+    char **res;
+    char *token;
+    char *node = NULL;
+    struct xenbus_watch *w;
+    int er;
+
+    res = xs_read_watch(h);
+    if (res == NULL) 
+        return -EAGAIN; /* in O_NONBLOCK, read_watch returns 0... */
+
+    node  = res[0];
+    token = res[1];
+
+    er = xs_acknowledge_watch(h, token);
+    if (er == 0)
+        warn("Couldn't acknowledge watch (%s)", token);
+
+    w = find_watch(token);
+    if (!w)
+    {
+        warn("unregistered watch fired");
+        goto done;
+    }
+    w->callback(h, w, node);
+
+ done:
+    free(res);
+    return 1;
+}
+
+
+
+
+/* ---------------------------------------------------------------------- */
+
+struct backend_info
+{
+    /* our communications channel */
+    blkif_t *blkif;
+    
+    long int frontend_id;
+    long int pdev;
+    long int readonly;
+    
+    /* watch back end for changes */
+    struct xenbus_watch backend_watch;
+    char *backpath;
+
+    /* watch front end for changes */
+    struct xenbus_watch watch;
+    char *frontpath;
+
+    struct list_head list;
+};
+
+static LIST_HEAD(belist);
+
+static struct backend_info *be_lookup_be(const char *bepath)
+{
+    struct backend_info *be;
+
+    list_for_each_entry(be, &belist, list)
+        if (strcmp(bepath, be->backpath) == 0)
+            return be;
+    return (struct backend_info *)NULL;
+}
+
+static int be_exists_be(const char *bepath)
+{
+    return ( be_lookup_be(bepath) != NULL );
+}
+
+static struct backend_info *be_lookup_fe(const char *fepath)
+{
+    struct backend_info *be;
+
+    list_for_each_entry(be, &belist, list)
+        if (strcmp(fepath, be->frontpath) == 0)
+            return be;
+    return (struct backend_info *)NULL;
+}
+
+static int backend_remove(struct xs_handle *h, struct backend_info *be)
+{
+    /* Turn off watches. */
+    if (be->watch.node)
+        unregister_xenbus_watch(h, &be->watch);
+    if (be->backend_watch.node)
+        unregister_xenbus_watch(h, &be->backend_watch);
+
+    /* Unhook from be list. */
+    list_del(&be->list);
+
+    /* Free everything else. */
+    if (be->blkif)
+        free_blkif(be->blkif);
+    if (be->frontpath)
+        free(be->frontpath);
+    if (be->backpath)
+        free(be->backpath);
+    free(be);
+    return 0;
+}
+
+static void frontend_changed(struct xs_handle *h, struct xenbus_watch *w, 
+                     const char *fepath_im)
+{
+    struct backend_info *be;
+    char *fepath = NULL;
+    int er;
+
+    be = be_lookup_fe(w->node);
+    if (be == NULL)
+    {
+        warn("frontend changed called for nonexistent backend! (%s)", fepath);
+        goto fail;
+    }
+    
+    /* If other end is gone, delete ourself. */
+    if (w->node && !xs_exists(h, be->frontpath)) {
+        DPRINTF("DELETING BE: %s\n", be->backpath);
+        backend_remove(h, be);
+        return;
+    }
+
+    if (be->blkif == NULL || (be->blkif->state == CONNECTED))
+        return;
+
+    /* Supply the information about the device the frontend needs */
+    er = xs_transaction_start(h, be->backpath);
+    if (er == 0) {
+        warn("starting transaction");
+        goto fail;
+    }
+    
+    er = xs_printf(h, be->backpath, "sectors", "%lu",
+                           be->blkif->ops->get_size(be->blkif));
+    if (er == 0) {
+        warn("writing sectors");
+        goto fail;
+    }
+    
+    er = xs_printf(h, be->backpath, "info", "%u",
+                           be->blkif->ops->get_info(be->blkif));
+    if (er == 0) {
+        warn("writing info");
+        goto fail;
+    }
+    
+    er = xs_printf(h, be->backpath, "sector-size", "%lu",
+                           be->blkif->ops->get_secsize(be->blkif));
+    if (er == 0) {
+        warn("writing sector-size");
+        goto fail;
+    }
+
+    be->blkif->state = CONNECTED;
+
+    xs_transaction_end(h, 0);
+
+    return;
+
+ fail:
+    if (fepath)
+        free(fepath);
+}
+
+
+static void backend_changed(struct xs_handle *h, struct xenbus_watch *w, 
+                     const char *bepath_im)
+{
+    struct backend_info *be;
+    char *path = NULL, *p;
+    int len, er;
+    long int pdev = 0, handle;
+
+    be = be_lookup_be(w->node);
+    if (be == NULL)
+    {
+        warn("backend changed called for nonexistent backend! (%s)", w->node);
+        goto fail;
+    }
+    
+    er = xs_gather(h, be->backpath, "physical-device", "%li", &pdev, NULL);
+    if (er != 0) 
+        goto fail;
+
+    if (be->pdev && be->pdev != pdev) {
+        warn("changing physical-device not supported");
+        goto fail;
+    }
+    be->pdev = pdev;
+
+    asprintf(&path, "%s/%s", w->node, "read-only");
+    if (xs_exists(h, path))
+        be->readonly = 1;
+
+    if (be->blkif == NULL) {
+        /* Front end dir is a number, which is used as the handle. */
+        p = strrchr(be->frontpath, '/') + 1;
+        handle = strtoul(p, NULL, 0);
+
+        be->blkif = alloc_blkif(be->frontend_id);
+        if (be->blkif == NULL) 
+            goto fail;
+
+        er = blkif_init(be->blkif, handle, be->pdev, be->readonly);
+        if (er) 
+            goto fail;
+
+        DPRINTF("[BECHG]: ADDED A NEW BLKIF (%s)\n", w->node);
+
+        /* The frontend path is known to exist at this point, so kick the
+         * connection handshake directly (the third argument is unused). */
+        frontend_changed(h, &be->watch, NULL);
+    }
+
+ fail:
+    if (path)
+        free(path);
+
+}
+
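+/* Probe callback, fired when something changes under the vbd backend
+ * directory for the watched domain.  Trims the changed path with
+ * strsep_len, allocates a backend_info for devices we have not seen
+ * before, and registers watches on the backend and frontend paths. */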
+static void blkback_probe(struct xs_handle *h, struct xenbus_watch *w, 
+                         const char *bepath_im)
+{
+       struct backend_info *be = NULL;
+       char *frontend = NULL, *bepath = NULL;
+       int er, len;
+
+        bepath = strdup(bepath_im);
+        if (!bepath)
+            return;
+        len = strsep_len(bepath, '/', 6);
+        if (len < 0) 
+            goto free_be;
+        
+        bepath[len] = '\0'; /* truncate the passed-in string with prejudice. */
+
+       be = malloc(sizeof(*be));
+       if (!be) {
+               warn("allocating backend structure");
+               goto free_be;
+       }
+       memset(be, 0, sizeof(*be));
+
+       frontend = NULL;
+       er = xs_gather(h, bepath,
+                        "frontend-id", "%li", &be->frontend_id,
+                        "frontend", NULL, &frontend,
+                        NULL);
+       if (er)
+               goto free_be;
+
+       if (strlen(frontend) == 0 || !xs_exists(h, frontend)) {
+            /* If we can't get a frontend path and a frontend-id,
+             * then our bus-id is no longer valid and we need to
+             * destroy the backend device.
+             */
+            DPRINTF("No frontend (%s)\n", frontend);
+            goto free_be;
+       }
+
+        /* Are we already tracking this device? */
+        if (be_exists_be(bepath))
+            goto free_be;
+
+        be->backpath = bepath;
+       be->backend_watch.node = be->backpath;
+       be->backend_watch.callback = backend_changed;
+       er = register_xenbus_watch(h, &be->backend_watch);
+       if (er == 0) {
+               be->backend_watch.node = NULL;
+               warn("error adding backend watch on %s", bepath);
+               goto free_be;
+       }
+
+       be->frontpath = frontend;
+       be->watch.node = be->frontpath;
+       be->watch.callback = frontend_changed;
+       er = register_xenbus_watch(h, &be->watch);
+       if (er == 0) {
+               be->watch.node = NULL;
+               warn("adding frontend watch on %s", be->frontpath);
+               goto free_be;
+       }
+
+        list_add(&be->list, &belist);
+
+        DPRINTF("[PROBE]: ADDED NEW DEVICE (%s)\n", bepath_im);
+
+       backend_changed(h, &be->backend_watch, bepath);
+       return;
+
+ free_be:
+       if ((be) && (be->backend_watch.node))
+            unregister_xenbus_watch(h, &be->backend_watch);
+       if (frontend)
+            free(frontend);
+        if (bepath)
+            free(bepath);
+       free(be);
+       return;
+}
+
+
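+/* Register the top-level probe watch on a domain's vbd backend directory
+ * in the store; blkback_probe then handles individual devices as they
+ * appear. */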
+int add_blockdevice_probe_watch(struct xs_handle *h, const char *domname)
+{
+    char *uuid, *path;
+    struct xenbus_watch *vbd_watch;
+    int er;
+
+    uuid = get_dom_uuid(h, domname);
+
+    DPRINTF("%s: %s\n", domname, (uuid != NULL) ? uuid : "[ not found! ]");
+
+    if (uuid == NULL)
+        return -ENOENT;
+
+    if (asprintf(&path, "/domain/%s/backend/vbd", uuid) == -1)
+        return -ENOMEM;
+
+    vbd_watch = (struct xenbus_watch *)malloc(sizeof(struct xenbus_watch));
+    if (vbd_watch == NULL) {
+        free(path);
+        return -ENOMEM;
+    }
+    vbd_watch->node     = path;
+    vbd_watch->callback = blkback_probe;
+    er = register_xenbus_watch(h, vbd_watch);
+    if (er == 0) {
+        warn("Error adding vbd probe watch %s", path);
+        free(vbd_watch);
+        free(path);
+        return -EINVAL;
+    }
+
+    return 0;
+}
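+
+/*
+ * Usage sketch (illustrative only -- the store connection and domain name
+ * shown here are assumptions, not part of this file): a user-level backend
+ * would typically connect to the store and register the probe watch once
+ * at startup, e.g.
+ *
+ *     struct xs_handle *h = xs_daemon_open();
+ *     if (h == NULL)
+ *         err(1, "connecting to xenstore");
+ *     if (add_blockdevice_probe_watch(h, "Domain-0") != 0)
+ *         warn("adding vbd probe watch");
+ *
+ * after which the taplib event loop dispatches the registered xenbus
+ * watch callbacks.
+ */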